{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 500, "global_step": 4777120, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0010466557256254814, "grad_norm": 3.64605450630188, "learning_rate": 2.4950000000000003e-06, "loss": 9.597, "step": 500 }, { "epoch": 0.002093311451250963, "grad_norm": 4.192243576049805, "learning_rate": 4.9950000000000005e-06, "loss": 9.4989, "step": 1000 }, { "epoch": 0.0031399671768764445, "grad_norm": 3.556063652038574, "learning_rate": 7.495e-06, "loss": 9.3844, "step": 1500 }, { "epoch": 0.004186622902501926, "grad_norm": 2.6045753955841064, "learning_rate": 9.995e-06, "loss": 9.2284, "step": 2000 }, { "epoch": 0.005233278628127407, "grad_norm": 2.502204418182373, "learning_rate": 1.2495000000000001e-05, "loss": 9.0133, "step": 2500 }, { "epoch": 0.006279934353752889, "grad_norm": 2.399658441543579, "learning_rate": 1.4995000000000001e-05, "loss": 8.7677, "step": 3000 }, { "epoch": 0.00732659007937837, "grad_norm": 2.006645917892456, "learning_rate": 1.7495e-05, "loss": 8.5087, "step": 3500 }, { "epoch": 0.008373245805003851, "grad_norm": 1.5697647333145142, "learning_rate": 1.9995e-05, "loss": 8.2816, "step": 4000 }, { "epoch": 0.009419901530629333, "grad_norm": 2.135687828063965, "learning_rate": 2.2495e-05, "loss": 8.0969, "step": 4500 }, { "epoch": 0.010466557256254814, "grad_norm": 1.897276520729065, "learning_rate": 2.4995e-05, "loss": 7.981, "step": 5000 }, { "epoch": 0.011513212981880296, "grad_norm": 1.990171194076538, "learning_rate": 2.7495000000000004e-05, "loss": 7.8855, "step": 5500 }, { "epoch": 0.012559868707505778, "grad_norm": 3.2215583324432373, "learning_rate": 2.9995e-05, "loss": 7.8214, "step": 6000 }, { "epoch": 0.013606524433131258, "grad_norm": 2.996917486190796, "learning_rate": 3.2495000000000007e-05, "loss": 7.7409, "step": 6500 }, { "epoch": 0.01465318015875674, "grad_norm": 2.1247940063476562, "learning_rate": 3.4995e-05, "loss": 7.683, "step": 7000 }, { "epoch": 0.015699835884382222, "grad_norm": 2.905912399291992, "learning_rate": 3.7495e-05, "loss": 7.6109, "step": 7500 }, { "epoch": 0.016746491610007703, "grad_norm": 2.9821040630340576, "learning_rate": 3.9995000000000006e-05, "loss": 7.5443, "step": 8000 }, { "epoch": 0.017793147335633183, "grad_norm": 2.297912120819092, "learning_rate": 4.2495e-05, "loss": 7.511, "step": 8500 }, { "epoch": 0.018839803061258667, "grad_norm": 2.093825578689575, "learning_rate": 4.4995000000000005e-05, "loss": 7.465, "step": 9000 }, { "epoch": 0.019886458786884147, "grad_norm": 2.5167033672332764, "learning_rate": 4.7495e-05, "loss": 7.4333, "step": 9500 }, { "epoch": 0.020933114512509628, "grad_norm": 2.7264904975891113, "learning_rate": 4.9995000000000005e-05, "loss": 7.3925, "step": 10000 }, { "epoch": 0.02197977023813511, "grad_norm": 2.9200127124786377, "learning_rate": 4.9994766232022685e-05, "loss": 7.3478, "step": 10500 }, { "epoch": 0.023026425963760592, "grad_norm": 3.4177768230438232, "learning_rate": 4.99895219755324e-05, "loss": 7.306, "step": 11000 }, { "epoch": 0.024073081689386072, "grad_norm": 3.2038590908050537, "learning_rate": 4.998427771904211e-05, "loss": 7.2644, "step": 11500 }, { "epoch": 0.025119737415011556, "grad_norm": 3.7542872428894043, "learning_rate": 4.997903346255182e-05, "loss": 7.21, "step": 12000 }, { "epoch": 0.026166393140637036, "grad_norm": 4.456035137176514, "learning_rate": 4.997378920606152e-05, "loss": 7.1672, "step": 12500 }, { "epoch": 0.027213048866262517, "grad_norm": 3.7280547618865967, "learning_rate": 4.9968544949571234e-05, "loss": 7.101, "step": 13000 }, { "epoch": 0.028259704591888, "grad_norm": 3.350606679916382, "learning_rate": 4.996330069308094e-05, "loss": 7.0326, "step": 13500 }, { "epoch": 0.02930636031751348, "grad_norm": 5.404280662536621, "learning_rate": 4.995805643659065e-05, "loss": 6.9654, "step": 14000 }, { "epoch": 0.03035301604313896, "grad_norm": 4.390084266662598, "learning_rate": 4.995281218010035e-05, "loss": 6.8913, "step": 14500 }, { "epoch": 0.031399671768764445, "grad_norm": 4.314885139465332, "learning_rate": 4.9947567923610064e-05, "loss": 6.8056, "step": 15000 }, { "epoch": 0.03244632749438992, "grad_norm": 4.049686908721924, "learning_rate": 4.9942323667119775e-05, "loss": 6.7646, "step": 15500 }, { "epoch": 0.033492983220015406, "grad_norm": 3.932662010192871, "learning_rate": 4.9937079410629487e-05, "loss": 6.6845, "step": 16000 }, { "epoch": 0.03453963894564089, "grad_norm": 5.159965515136719, "learning_rate": 4.993183515413919e-05, "loss": 6.6359, "step": 16500 }, { "epoch": 0.035586294671266366, "grad_norm": 4.517904281616211, "learning_rate": 4.9926590897648895e-05, "loss": 6.5778, "step": 17000 }, { "epoch": 0.03663295039689185, "grad_norm": 4.751391887664795, "learning_rate": 4.9921346641158606e-05, "loss": 6.512, "step": 17500 }, { "epoch": 0.037679606122517334, "grad_norm": 4.5365376472473145, "learning_rate": 4.991610238466831e-05, "loss": 6.4786, "step": 18000 }, { "epoch": 0.03872626184814281, "grad_norm": 4.895058631896973, "learning_rate": 4.991085812817802e-05, "loss": 6.418, "step": 18500 }, { "epoch": 0.039772917573768295, "grad_norm": 4.625970363616943, "learning_rate": 4.9905613871687726e-05, "loss": 6.3548, "step": 19000 }, { "epoch": 0.04081957329939378, "grad_norm": 4.2448625564575195, "learning_rate": 4.990036961519744e-05, "loss": 6.3197, "step": 19500 }, { "epoch": 0.041866229025019255, "grad_norm": 4.296064376831055, "learning_rate": 4.989512535870715e-05, "loss": 6.2686, "step": 20000 }, { "epoch": 0.04291288475064474, "grad_norm": 4.193862438201904, "learning_rate": 4.988988110221686e-05, "loss": 6.2422, "step": 20500 }, { "epoch": 0.04395954047627022, "grad_norm": 4.207557201385498, "learning_rate": 4.9884636845726563e-05, "loss": 6.1784, "step": 21000 }, { "epoch": 0.0450061962018957, "grad_norm": 4.527471542358398, "learning_rate": 4.9879392589236275e-05, "loss": 6.1328, "step": 21500 }, { "epoch": 0.046052851927521184, "grad_norm": 5.341414928436279, "learning_rate": 4.987414833274598e-05, "loss": 6.0943, "step": 22000 }, { "epoch": 0.04709950765314667, "grad_norm": 4.897065162658691, "learning_rate": 4.986890407625568e-05, "loss": 6.034, "step": 22500 }, { "epoch": 0.048146163378772144, "grad_norm": 4.786191940307617, "learning_rate": 4.9863659819765394e-05, "loss": 6.0005, "step": 23000 }, { "epoch": 0.04919281910439763, "grad_norm": 4.633001327514648, "learning_rate": 4.98584155632751e-05, "loss": 5.9536, "step": 23500 }, { "epoch": 0.05023947483002311, "grad_norm": 5.17041015625, "learning_rate": 4.985317130678481e-05, "loss": 5.929, "step": 24000 }, { "epoch": 0.05128613055564859, "grad_norm": 5.260965824127197, "learning_rate": 4.984792705029452e-05, "loss": 5.8862, "step": 24500 }, { "epoch": 0.05233278628127407, "grad_norm": 5.115767955780029, "learning_rate": 4.984268279380423e-05, "loss": 5.8588, "step": 25000 }, { "epoch": 0.053379442006899556, "grad_norm": 4.97722053527832, "learning_rate": 4.9837438537313936e-05, "loss": 5.8227, "step": 25500 }, { "epoch": 0.05442609773252503, "grad_norm": 5.624172687530518, "learning_rate": 4.983219428082365e-05, "loss": 5.7951, "step": 26000 }, { "epoch": 0.05547275345815052, "grad_norm": 6.413504600524902, "learning_rate": 4.982695002433335e-05, "loss": 5.7602, "step": 26500 }, { "epoch": 0.056519409183776, "grad_norm": 5.101490497589111, "learning_rate": 4.982170576784306e-05, "loss": 5.6993, "step": 27000 }, { "epoch": 0.05756606490940148, "grad_norm": 6.028198719024658, "learning_rate": 4.981646151135277e-05, "loss": 5.6738, "step": 27500 }, { "epoch": 0.05861272063502696, "grad_norm": 4.985255241394043, "learning_rate": 4.981121725486247e-05, "loss": 5.6618, "step": 28000 }, { "epoch": 0.059659376360652445, "grad_norm": 5.79319429397583, "learning_rate": 4.980597299837219e-05, "loss": 5.6215, "step": 28500 }, { "epoch": 0.06070603208627792, "grad_norm": 5.603042125701904, "learning_rate": 4.980072874188189e-05, "loss": 5.579, "step": 29000 }, { "epoch": 0.061752687811903406, "grad_norm": 5.896328926086426, "learning_rate": 4.9795484485391604e-05, "loss": 5.5555, "step": 29500 }, { "epoch": 0.06279934353752889, "grad_norm": 5.602357387542725, "learning_rate": 4.979024022890131e-05, "loss": 5.5315, "step": 30000 }, { "epoch": 0.06384599926315437, "grad_norm": 5.236395359039307, "learning_rate": 4.978499597241102e-05, "loss": 5.5108, "step": 30500 }, { "epoch": 0.06489265498877984, "grad_norm": 5.643289089202881, "learning_rate": 4.9779751715920724e-05, "loss": 5.4698, "step": 31000 }, { "epoch": 0.06593931071440533, "grad_norm": 6.202617645263672, "learning_rate": 4.9774507459430435e-05, "loss": 5.4308, "step": 31500 }, { "epoch": 0.06698596644003081, "grad_norm": 6.030649185180664, "learning_rate": 4.976926320294014e-05, "loss": 5.4321, "step": 32000 }, { "epoch": 0.0680326221656563, "grad_norm": 5.539003849029541, "learning_rate": 4.976401894644985e-05, "loss": 5.4004, "step": 32500 }, { "epoch": 0.06907927789128178, "grad_norm": 6.096409797668457, "learning_rate": 4.975877468995956e-05, "loss": 5.3789, "step": 33000 }, { "epoch": 0.07012593361690726, "grad_norm": 5.048076629638672, "learning_rate": 4.9753530433469266e-05, "loss": 5.3502, "step": 33500 }, { "epoch": 0.07117258934253273, "grad_norm": 5.414154529571533, "learning_rate": 4.974828617697898e-05, "loss": 5.3094, "step": 34000 }, { "epoch": 0.07221924506815822, "grad_norm": 6.408080577850342, "learning_rate": 4.974304192048868e-05, "loss": 5.3189, "step": 34500 }, { "epoch": 0.0732659007937837, "grad_norm": 5.208996772766113, "learning_rate": 4.973779766399839e-05, "loss": 5.2679, "step": 35000 }, { "epoch": 0.07431255651940918, "grad_norm": 5.5665082931518555, "learning_rate": 4.97325534075081e-05, "loss": 5.2709, "step": 35500 }, { "epoch": 0.07535921224503467, "grad_norm": 6.81998348236084, "learning_rate": 4.972730915101781e-05, "loss": 5.2434, "step": 36000 }, { "epoch": 0.07640586797066015, "grad_norm": 6.66893196105957, "learning_rate": 4.972206489452751e-05, "loss": 5.2051, "step": 36500 }, { "epoch": 0.07745252369628562, "grad_norm": 5.65146541595459, "learning_rate": 4.971682063803722e-05, "loss": 5.1963, "step": 37000 }, { "epoch": 0.0784991794219111, "grad_norm": 6.458414554595947, "learning_rate": 4.9711576381546934e-05, "loss": 5.1673, "step": 37500 }, { "epoch": 0.07954583514753659, "grad_norm": 5.945553302764893, "learning_rate": 4.9706332125056645e-05, "loss": 5.1446, "step": 38000 }, { "epoch": 0.08059249087316207, "grad_norm": 6.245899200439453, "learning_rate": 4.970108786856635e-05, "loss": 5.1111, "step": 38500 }, { "epoch": 0.08163914659878756, "grad_norm": 6.133769512176514, "learning_rate": 4.9695843612076054e-05, "loss": 5.0977, "step": 39000 }, { "epoch": 0.08268580232441304, "grad_norm": 7.222833633422852, "learning_rate": 4.9690599355585765e-05, "loss": 5.0677, "step": 39500 }, { "epoch": 0.08373245805003851, "grad_norm": 6.079624652862549, "learning_rate": 4.968535509909547e-05, "loss": 5.0499, "step": 40000 }, { "epoch": 0.084779113775664, "grad_norm": 6.004246711730957, "learning_rate": 4.968011084260518e-05, "loss": 5.0602, "step": 40500 }, { "epoch": 0.08582576950128948, "grad_norm": 5.876736640930176, "learning_rate": 4.9674866586114885e-05, "loss": 5.0569, "step": 41000 }, { "epoch": 0.08687242522691496, "grad_norm": 7.276224613189697, "learning_rate": 4.9669622329624596e-05, "loss": 5.0083, "step": 41500 }, { "epoch": 0.08791908095254045, "grad_norm": 6.691667079925537, "learning_rate": 4.966437807313431e-05, "loss": 5.0058, "step": 42000 }, { "epoch": 0.08896573667816593, "grad_norm": 6.350453853607178, "learning_rate": 4.965913381664402e-05, "loss": 4.9726, "step": 42500 }, { "epoch": 0.0900123924037914, "grad_norm": 6.684863090515137, "learning_rate": 4.965388956015372e-05, "loss": 4.9765, "step": 43000 }, { "epoch": 0.09105904812941688, "grad_norm": 6.460486888885498, "learning_rate": 4.964864530366343e-05, "loss": 4.9466, "step": 43500 }, { "epoch": 0.09210570385504237, "grad_norm": 5.529162406921387, "learning_rate": 4.964340104717314e-05, "loss": 4.9157, "step": 44000 }, { "epoch": 0.09315235958066785, "grad_norm": 6.254817962646484, "learning_rate": 4.963815679068285e-05, "loss": 4.9062, "step": 44500 }, { "epoch": 0.09419901530629333, "grad_norm": 6.990886688232422, "learning_rate": 4.963291253419255e-05, "loss": 4.8854, "step": 45000 }, { "epoch": 0.09524567103191882, "grad_norm": 6.865322113037109, "learning_rate": 4.962766827770226e-05, "loss": 4.8952, "step": 45500 }, { "epoch": 0.09629232675754429, "grad_norm": 7.7921271324157715, "learning_rate": 4.962242402121197e-05, "loss": 4.8569, "step": 46000 }, { "epoch": 0.09733898248316977, "grad_norm": 9.084587097167969, "learning_rate": 4.961717976472168e-05, "loss": 4.847, "step": 46500 }, { "epoch": 0.09838563820879526, "grad_norm": 6.629016876220703, "learning_rate": 4.961193550823139e-05, "loss": 4.8297, "step": 47000 }, { "epoch": 0.09943229393442074, "grad_norm": 7.885711193084717, "learning_rate": 4.9606691251741095e-05, "loss": 4.801, "step": 47500 }, { "epoch": 0.10047894966004622, "grad_norm": 7.069065093994141, "learning_rate": 4.9601446995250806e-05, "loss": 4.8077, "step": 48000 }, { "epoch": 0.10152560538567171, "grad_norm": 7.027377128601074, "learning_rate": 4.959620273876051e-05, "loss": 4.7858, "step": 48500 }, { "epoch": 0.10257226111129718, "grad_norm": 6.624495983123779, "learning_rate": 4.959095848227022e-05, "loss": 4.7625, "step": 49000 }, { "epoch": 0.10361891683692266, "grad_norm": 6.6265549659729, "learning_rate": 4.9585714225779926e-05, "loss": 4.7486, "step": 49500 }, { "epoch": 0.10466557256254815, "grad_norm": 7.031954765319824, "learning_rate": 4.958046996928964e-05, "loss": 4.7378, "step": 50000 }, { "epoch": 0.10571222828817363, "grad_norm": 6.984372615814209, "learning_rate": 4.957522571279935e-05, "loss": 4.7243, "step": 50500 }, { "epoch": 0.10675888401379911, "grad_norm": 7.820217609405518, "learning_rate": 4.956998145630905e-05, "loss": 4.6914, "step": 51000 }, { "epoch": 0.1078055397394246, "grad_norm": 6.973567008972168, "learning_rate": 4.956473719981876e-05, "loss": 4.6978, "step": 51500 }, { "epoch": 0.10885219546505007, "grad_norm": 7.373286724090576, "learning_rate": 4.955949294332847e-05, "loss": 4.6877, "step": 52000 }, { "epoch": 0.10989885119067555, "grad_norm": 6.093576431274414, "learning_rate": 4.955424868683818e-05, "loss": 4.6698, "step": 52500 }, { "epoch": 0.11094550691630103, "grad_norm": 6.94814920425415, "learning_rate": 4.954900443034788e-05, "loss": 4.6551, "step": 53000 }, { "epoch": 0.11199216264192652, "grad_norm": 6.788534164428711, "learning_rate": 4.9543760173857594e-05, "loss": 4.6389, "step": 53500 }, { "epoch": 0.113038818367552, "grad_norm": 7.0019731521606445, "learning_rate": 4.95385159173673e-05, "loss": 4.6118, "step": 54000 }, { "epoch": 0.11408547409317749, "grad_norm": 7.307645320892334, "learning_rate": 4.953327166087701e-05, "loss": 4.6111, "step": 54500 }, { "epoch": 0.11513212981880296, "grad_norm": 6.926957130432129, "learning_rate": 4.952802740438672e-05, "loss": 4.6101, "step": 55000 }, { "epoch": 0.11617878554442844, "grad_norm": 7.193602085113525, "learning_rate": 4.952278314789643e-05, "loss": 4.5866, "step": 55500 }, { "epoch": 0.11722544127005392, "grad_norm": 7.387453556060791, "learning_rate": 4.9517538891406136e-05, "loss": 4.5847, "step": 56000 }, { "epoch": 0.11827209699567941, "grad_norm": 7.325709819793701, "learning_rate": 4.951229463491584e-05, "loss": 4.5568, "step": 56500 }, { "epoch": 0.11931875272130489, "grad_norm": 6.833515167236328, "learning_rate": 4.950705037842555e-05, "loss": 4.5619, "step": 57000 }, { "epoch": 0.12036540844693037, "grad_norm": 6.9426350593566895, "learning_rate": 4.9501806121935255e-05, "loss": 4.5285, "step": 57500 }, { "epoch": 0.12141206417255584, "grad_norm": 6.727519512176514, "learning_rate": 4.9496561865444967e-05, "loss": 4.5238, "step": 58000 }, { "epoch": 0.12245871989818133, "grad_norm": 8.102425575256348, "learning_rate": 4.949131760895467e-05, "loss": 4.542, "step": 58500 }, { "epoch": 0.12350537562380681, "grad_norm": 7.100870132446289, "learning_rate": 4.948607335246438e-05, "loss": 4.5028, "step": 59000 }, { "epoch": 0.1245520313494323, "grad_norm": 7.742684841156006, "learning_rate": 4.948082909597409e-05, "loss": 4.4981, "step": 59500 }, { "epoch": 0.12559868707505778, "grad_norm": 7.060272216796875, "learning_rate": 4.9475584839483804e-05, "loss": 4.4922, "step": 60000 }, { "epoch": 0.12664534280068326, "grad_norm": 8.001740455627441, "learning_rate": 4.947034058299351e-05, "loss": 4.4649, "step": 60500 }, { "epoch": 0.12769199852630875, "grad_norm": 8.404182434082031, "learning_rate": 4.946509632650322e-05, "loss": 4.4587, "step": 61000 }, { "epoch": 0.12873865425193423, "grad_norm": 7.447577953338623, "learning_rate": 4.9459852070012924e-05, "loss": 4.4515, "step": 61500 }, { "epoch": 0.1297853099775597, "grad_norm": 8.091418266296387, "learning_rate": 4.945460781352263e-05, "loss": 4.4381, "step": 62000 }, { "epoch": 0.13083196570318517, "grad_norm": 7.1255202293396, "learning_rate": 4.944936355703234e-05, "loss": 4.4441, "step": 62500 }, { "epoch": 0.13187862142881066, "grad_norm": 8.02684211730957, "learning_rate": 4.9444119300542043e-05, "loss": 4.4215, "step": 63000 }, { "epoch": 0.13292527715443614, "grad_norm": 7.683554172515869, "learning_rate": 4.9438875044051755e-05, "loss": 4.4327, "step": 63500 }, { "epoch": 0.13397193288006162, "grad_norm": 8.080080032348633, "learning_rate": 4.9433630787561466e-05, "loss": 4.4068, "step": 64000 }, { "epoch": 0.1350185886056871, "grad_norm": 7.458099365234375, "learning_rate": 4.942838653107118e-05, "loss": 4.4201, "step": 64500 }, { "epoch": 0.1360652443313126, "grad_norm": 7.462032794952393, "learning_rate": 4.942314227458088e-05, "loss": 4.3637, "step": 65000 }, { "epoch": 0.13711190005693807, "grad_norm": 7.472281455993652, "learning_rate": 4.941789801809059e-05, "loss": 4.351, "step": 65500 }, { "epoch": 0.13815855578256356, "grad_norm": 7.934845924377441, "learning_rate": 4.9412653761600296e-05, "loss": 4.3533, "step": 66000 }, { "epoch": 0.13920521150818904, "grad_norm": 7.513514041900635, "learning_rate": 4.940740950511001e-05, "loss": 4.356, "step": 66500 }, { "epoch": 0.14025186723381453, "grad_norm": 8.049979209899902, "learning_rate": 4.940216524861971e-05, "loss": 4.364, "step": 67000 }, { "epoch": 0.14129852295944, "grad_norm": 7.7549543380737305, "learning_rate": 4.939692099212942e-05, "loss": 4.3392, "step": 67500 }, { "epoch": 0.14234517868506547, "grad_norm": 7.425003528594971, "learning_rate": 4.9391676735639134e-05, "loss": 4.3209, "step": 68000 }, { "epoch": 0.14339183441069095, "grad_norm": 8.367003440856934, "learning_rate": 4.938643247914884e-05, "loss": 4.3028, "step": 68500 }, { "epoch": 0.14443849013631643, "grad_norm": 8.997605323791504, "learning_rate": 4.938118822265855e-05, "loss": 4.2991, "step": 69000 }, { "epoch": 0.14548514586194192, "grad_norm": 9.737692832946777, "learning_rate": 4.9375943966168254e-05, "loss": 4.2749, "step": 69500 }, { "epoch": 0.1465318015875674, "grad_norm": 7.125947952270508, "learning_rate": 4.9370699709677965e-05, "loss": 4.3031, "step": 70000 }, { "epoch": 0.14757845731319288, "grad_norm": 8.609292984008789, "learning_rate": 4.936545545318767e-05, "loss": 4.2763, "step": 70500 }, { "epoch": 0.14862511303881837, "grad_norm": 8.131288528442383, "learning_rate": 4.936021119669738e-05, "loss": 4.273, "step": 71000 }, { "epoch": 0.14967176876444385, "grad_norm": 7.420558452606201, "learning_rate": 4.9354966940207084e-05, "loss": 4.2634, "step": 71500 }, { "epoch": 0.15071842449006934, "grad_norm": 7.5696024894714355, "learning_rate": 4.9349722683716795e-05, "loss": 4.2394, "step": 72000 }, { "epoch": 0.15176508021569482, "grad_norm": 8.54397964477539, "learning_rate": 4.9344478427226506e-05, "loss": 4.2408, "step": 72500 }, { "epoch": 0.1528117359413203, "grad_norm": 9.530181884765625, "learning_rate": 4.933923417073621e-05, "loss": 4.2436, "step": 73000 }, { "epoch": 0.1538583916669458, "grad_norm": 8.272136688232422, "learning_rate": 4.933398991424592e-05, "loss": 4.2482, "step": 73500 }, { "epoch": 0.15490504739257124, "grad_norm": 8.339431762695312, "learning_rate": 4.9328745657755626e-05, "loss": 4.2098, "step": 74000 }, { "epoch": 0.15595170311819673, "grad_norm": 10.416884422302246, "learning_rate": 4.932350140126534e-05, "loss": 4.2143, "step": 74500 }, { "epoch": 0.1569983588438222, "grad_norm": 7.975149631500244, "learning_rate": 4.931825714477504e-05, "loss": 4.1971, "step": 75000 }, { "epoch": 0.1580450145694477, "grad_norm": 8.726923942565918, "learning_rate": 4.931301288828475e-05, "loss": 4.1842, "step": 75500 }, { "epoch": 0.15909167029507318, "grad_norm": 8.041524887084961, "learning_rate": 4.930776863179446e-05, "loss": 4.1787, "step": 76000 }, { "epoch": 0.16013832602069866, "grad_norm": 7.997225761413574, "learning_rate": 4.930252437530417e-05, "loss": 4.1901, "step": 76500 }, { "epoch": 0.16118498174632415, "grad_norm": 8.139989852905273, "learning_rate": 4.929728011881388e-05, "loss": 4.1766, "step": 77000 }, { "epoch": 0.16223163747194963, "grad_norm": 9.176040649414062, "learning_rate": 4.929203586232359e-05, "loss": 4.1607, "step": 77500 }, { "epoch": 0.16327829319757511, "grad_norm": 9.098502159118652, "learning_rate": 4.9286791605833294e-05, "loss": 4.149, "step": 78000 }, { "epoch": 0.1643249489232006, "grad_norm": 8.84504222869873, "learning_rate": 4.9281547349343006e-05, "loss": 4.158, "step": 78500 }, { "epoch": 0.16537160464882608, "grad_norm": 8.766018867492676, "learning_rate": 4.927630309285271e-05, "loss": 4.1337, "step": 79000 }, { "epoch": 0.16641826037445157, "grad_norm": 8.32721996307373, "learning_rate": 4.9271058836362414e-05, "loss": 4.1325, "step": 79500 }, { "epoch": 0.16746491610007702, "grad_norm": 7.921020984649658, "learning_rate": 4.9265814579872125e-05, "loss": 4.121, "step": 80000 }, { "epoch": 0.1685115718257025, "grad_norm": 8.113018989562988, "learning_rate": 4.926057032338183e-05, "loss": 4.1175, "step": 80500 }, { "epoch": 0.169558227551328, "grad_norm": 7.923780918121338, "learning_rate": 4.925532606689154e-05, "loss": 4.1183, "step": 81000 }, { "epoch": 0.17060488327695347, "grad_norm": 8.166654586791992, "learning_rate": 4.925008181040125e-05, "loss": 4.0867, "step": 81500 }, { "epoch": 0.17165153900257896, "grad_norm": 9.584228515625, "learning_rate": 4.924483755391096e-05, "loss": 4.1038, "step": 82000 }, { "epoch": 0.17269819472820444, "grad_norm": 9.009160995483398, "learning_rate": 4.923959329742067e-05, "loss": 4.0804, "step": 82500 }, { "epoch": 0.17374485045382992, "grad_norm": 8.721781730651855, "learning_rate": 4.923434904093038e-05, "loss": 4.0819, "step": 83000 }, { "epoch": 0.1747915061794554, "grad_norm": 8.227258682250977, "learning_rate": 4.922910478444008e-05, "loss": 4.0708, "step": 83500 }, { "epoch": 0.1758381619050809, "grad_norm": 10.180132865905762, "learning_rate": 4.9223860527949794e-05, "loss": 4.0517, "step": 84000 }, { "epoch": 0.17688481763070638, "grad_norm": 9.789459228515625, "learning_rate": 4.92186162714595e-05, "loss": 4.0458, "step": 84500 }, { "epoch": 0.17793147335633186, "grad_norm": 8.524105072021484, "learning_rate": 4.921337201496921e-05, "loss": 4.0491, "step": 85000 }, { "epoch": 0.17897812908195734, "grad_norm": 10.976552963256836, "learning_rate": 4.920812775847892e-05, "loss": 4.0615, "step": 85500 }, { "epoch": 0.1800247848075828, "grad_norm": 9.248417854309082, "learning_rate": 4.9202883501988624e-05, "loss": 4.0377, "step": 86000 }, { "epoch": 0.18107144053320828, "grad_norm": 9.169407844543457, "learning_rate": 4.9197639245498335e-05, "loss": 4.032, "step": 86500 }, { "epoch": 0.18211809625883377, "grad_norm": 9.867403984069824, "learning_rate": 4.919239498900804e-05, "loss": 4.0252, "step": 87000 }, { "epoch": 0.18316475198445925, "grad_norm": 9.492571830749512, "learning_rate": 4.918715073251775e-05, "loss": 4.0157, "step": 87500 }, { "epoch": 0.18421140771008473, "grad_norm": 12.658378601074219, "learning_rate": 4.9181906476027455e-05, "loss": 4.0326, "step": 88000 }, { "epoch": 0.18525806343571022, "grad_norm": 10.190011024475098, "learning_rate": 4.9176662219537166e-05, "loss": 3.9885, "step": 88500 }, { "epoch": 0.1863047191613357, "grad_norm": 8.705041885375977, "learning_rate": 4.917141796304687e-05, "loss": 4.0138, "step": 89000 }, { "epoch": 0.1873513748869612, "grad_norm": 8.16767692565918, "learning_rate": 4.916617370655658e-05, "loss": 3.9872, "step": 89500 }, { "epoch": 0.18839803061258667, "grad_norm": 9.612553596496582, "learning_rate": 4.916092945006629e-05, "loss": 3.9753, "step": 90000 }, { "epoch": 0.18944468633821215, "grad_norm": 8.702152252197266, "learning_rate": 4.9155685193576e-05, "loss": 3.9983, "step": 90500 }, { "epoch": 0.19049134206383764, "grad_norm": 8.502490997314453, "learning_rate": 4.915044093708571e-05, "loss": 3.9726, "step": 91000 }, { "epoch": 0.1915379977894631, "grad_norm": 8.386590003967285, "learning_rate": 4.914519668059541e-05, "loss": 3.9766, "step": 91500 }, { "epoch": 0.19258465351508858, "grad_norm": 11.067843437194824, "learning_rate": 4.9139952424105123e-05, "loss": 3.9692, "step": 92000 }, { "epoch": 0.19363130924071406, "grad_norm": 8.627717971801758, "learning_rate": 4.913470816761483e-05, "loss": 3.9705, "step": 92500 }, { "epoch": 0.19467796496633955, "grad_norm": 8.876980781555176, "learning_rate": 4.912946391112454e-05, "loss": 3.9514, "step": 93000 }, { "epoch": 0.19572462069196503, "grad_norm": 8.704911231994629, "learning_rate": 4.912421965463424e-05, "loss": 3.9487, "step": 93500 }, { "epoch": 0.1967712764175905, "grad_norm": 9.181367874145508, "learning_rate": 4.9118975398143954e-05, "loss": 3.9625, "step": 94000 }, { "epoch": 0.197817932143216, "grad_norm": 9.26285457611084, "learning_rate": 4.9113731141653665e-05, "loss": 3.9384, "step": 94500 }, { "epoch": 0.19886458786884148, "grad_norm": 9.374444961547852, "learning_rate": 4.9108486885163376e-05, "loss": 3.9235, "step": 95000 }, { "epoch": 0.19991124359446696, "grad_norm": 8.755977630615234, "learning_rate": 4.910324262867308e-05, "loss": 3.9324, "step": 95500 }, { "epoch": 0.20095789932009245, "grad_norm": 8.30579662322998, "learning_rate": 4.9097998372182785e-05, "loss": 3.9126, "step": 96000 }, { "epoch": 0.20200455504571793, "grad_norm": 10.230141639709473, "learning_rate": 4.9092754115692496e-05, "loss": 3.9143, "step": 96500 }, { "epoch": 0.20305121077134342, "grad_norm": 8.642914772033691, "learning_rate": 4.90875098592022e-05, "loss": 3.9251, "step": 97000 }, { "epoch": 0.20409786649696887, "grad_norm": 7.933784008026123, "learning_rate": 4.908226560271191e-05, "loss": 3.8883, "step": 97500 }, { "epoch": 0.20514452222259436, "grad_norm": 8.96338939666748, "learning_rate": 4.9077021346221616e-05, "loss": 3.8981, "step": 98000 }, { "epoch": 0.20619117794821984, "grad_norm": 9.458770751953125, "learning_rate": 4.907177708973133e-05, "loss": 3.9017, "step": 98500 }, { "epoch": 0.20723783367384532, "grad_norm": 8.5197114944458, "learning_rate": 4.906653283324104e-05, "loss": 3.8879, "step": 99000 }, { "epoch": 0.2082844893994708, "grad_norm": 8.986029624938965, "learning_rate": 4.906128857675075e-05, "loss": 3.8499, "step": 99500 }, { "epoch": 0.2093311451250963, "grad_norm": 9.112171173095703, "learning_rate": 4.905604432026045e-05, "loss": 3.8868, "step": 100000 }, { "epoch": 0.21037780085072177, "grad_norm": 9.110477447509766, "learning_rate": 4.9050800063770164e-05, "loss": 3.8697, "step": 100500 }, { "epoch": 0.21142445657634726, "grad_norm": 9.7176513671875, "learning_rate": 4.904555580727987e-05, "loss": 3.8755, "step": 101000 }, { "epoch": 0.21247111230197274, "grad_norm": 8.108610153198242, "learning_rate": 4.904031155078957e-05, "loss": 3.8889, "step": 101500 }, { "epoch": 0.21351776802759823, "grad_norm": 8.106103897094727, "learning_rate": 4.9035067294299284e-05, "loss": 3.8707, "step": 102000 }, { "epoch": 0.2145644237532237, "grad_norm": 11.612373352050781, "learning_rate": 4.9029823037808995e-05, "loss": 3.8488, "step": 102500 }, { "epoch": 0.2156110794788492, "grad_norm": 9.667598724365234, "learning_rate": 4.9024578781318706e-05, "loss": 3.836, "step": 103000 }, { "epoch": 0.21665773520447465, "grad_norm": 11.72484302520752, "learning_rate": 4.901933452482841e-05, "loss": 3.8451, "step": 103500 }, { "epoch": 0.21770439093010013, "grad_norm": 10.606128692626953, "learning_rate": 4.901409026833812e-05, "loss": 3.8472, "step": 104000 }, { "epoch": 0.21875104665572562, "grad_norm": 10.606013298034668, "learning_rate": 4.9008846011847826e-05, "loss": 3.8359, "step": 104500 }, { "epoch": 0.2197977023813511, "grad_norm": 8.975906372070312, "learning_rate": 4.900360175535754e-05, "loss": 3.836, "step": 105000 }, { "epoch": 0.22084435810697658, "grad_norm": 9.780426979064941, "learning_rate": 4.899835749886724e-05, "loss": 3.8411, "step": 105500 }, { "epoch": 0.22189101383260207, "grad_norm": 10.327115058898926, "learning_rate": 4.899311324237695e-05, "loss": 3.8211, "step": 106000 }, { "epoch": 0.22293766955822755, "grad_norm": 8.694579124450684, "learning_rate": 4.8987868985886657e-05, "loss": 3.8064, "step": 106500 }, { "epoch": 0.22398432528385304, "grad_norm": 9.27409553527832, "learning_rate": 4.898262472939637e-05, "loss": 3.8239, "step": 107000 }, { "epoch": 0.22503098100947852, "grad_norm": 9.751458168029785, "learning_rate": 4.897738047290608e-05, "loss": 3.7966, "step": 107500 }, { "epoch": 0.226077636735104, "grad_norm": 9.185708999633789, "learning_rate": 4.897213621641578e-05, "loss": 3.822, "step": 108000 }, { "epoch": 0.2271242924607295, "grad_norm": 9.227031707763672, "learning_rate": 4.8966891959925494e-05, "loss": 3.8041, "step": 108500 }, { "epoch": 0.22817094818635497, "grad_norm": 9.26705551147461, "learning_rate": 4.89616477034352e-05, "loss": 3.7975, "step": 109000 }, { "epoch": 0.22921760391198043, "grad_norm": 10.14479923248291, "learning_rate": 4.895640344694491e-05, "loss": 3.7963, "step": 109500 }, { "epoch": 0.2302642596376059, "grad_norm": 9.646303176879883, "learning_rate": 4.8951159190454614e-05, "loss": 3.8238, "step": 110000 }, { "epoch": 0.2313109153632314, "grad_norm": 10.834391593933105, "learning_rate": 4.8945914933964325e-05, "loss": 3.7812, "step": 110500 }, { "epoch": 0.23235757108885688, "grad_norm": 9.785489082336426, "learning_rate": 4.894067067747403e-05, "loss": 3.7814, "step": 111000 }, { "epoch": 0.23340422681448236, "grad_norm": 8.428281784057617, "learning_rate": 4.893542642098374e-05, "loss": 3.7703, "step": 111500 }, { "epoch": 0.23445088254010785, "grad_norm": 10.469707489013672, "learning_rate": 4.893018216449345e-05, "loss": 3.7675, "step": 112000 }, { "epoch": 0.23549753826573333, "grad_norm": 9.148385047912598, "learning_rate": 4.892493790800316e-05, "loss": 3.7591, "step": 112500 }, { "epoch": 0.23654419399135881, "grad_norm": 9.309948921203613, "learning_rate": 4.891969365151287e-05, "loss": 3.7725, "step": 113000 }, { "epoch": 0.2375908497169843, "grad_norm": 10.725934982299805, "learning_rate": 4.891444939502257e-05, "loss": 3.7649, "step": 113500 }, { "epoch": 0.23863750544260978, "grad_norm": 10.073323249816895, "learning_rate": 4.890920513853228e-05, "loss": 3.7657, "step": 114000 }, { "epoch": 0.23968416116823527, "grad_norm": 8.668704986572266, "learning_rate": 4.8903960882041986e-05, "loss": 3.7622, "step": 114500 }, { "epoch": 0.24073081689386075, "grad_norm": 11.145539283752441, "learning_rate": 4.88987166255517e-05, "loss": 3.7421, "step": 115000 }, { "epoch": 0.2417774726194862, "grad_norm": 10.671344757080078, "learning_rate": 4.88934723690614e-05, "loss": 3.7576, "step": 115500 }, { "epoch": 0.2428241283451117, "grad_norm": 9.717750549316406, "learning_rate": 4.888822811257111e-05, "loss": 3.7501, "step": 116000 }, { "epoch": 0.24387078407073717, "grad_norm": 10.641368865966797, "learning_rate": 4.8882983856080824e-05, "loss": 3.7422, "step": 116500 }, { "epoch": 0.24491743979636266, "grad_norm": 8.914629936218262, "learning_rate": 4.8877739599590535e-05, "loss": 3.7541, "step": 117000 }, { "epoch": 0.24596409552198814, "grad_norm": 10.46849250793457, "learning_rate": 4.887249534310024e-05, "loss": 3.7364, "step": 117500 }, { "epoch": 0.24701075124761362, "grad_norm": 9.60447883605957, "learning_rate": 4.886725108660995e-05, "loss": 3.7467, "step": 118000 }, { "epoch": 0.2480574069732391, "grad_norm": 10.49460220336914, "learning_rate": 4.8862006830119655e-05, "loss": 3.712, "step": 118500 }, { "epoch": 0.2491040626988646, "grad_norm": 14.539194107055664, "learning_rate": 4.885676257362936e-05, "loss": 3.7165, "step": 119000 }, { "epoch": 0.25015071842449005, "grad_norm": 10.182270050048828, "learning_rate": 4.885151831713907e-05, "loss": 3.7123, "step": 119500 }, { "epoch": 0.25119737415011556, "grad_norm": 9.267268180847168, "learning_rate": 4.884627406064878e-05, "loss": 3.6992, "step": 120000 }, { "epoch": 0.252244029875741, "grad_norm": 9.057978630065918, "learning_rate": 4.884102980415849e-05, "loss": 3.7385, "step": 120500 }, { "epoch": 0.2532906856013665, "grad_norm": 10.707968711853027, "learning_rate": 4.8835785547668197e-05, "loss": 3.7208, "step": 121000 }, { "epoch": 0.254337341326992, "grad_norm": 11.377409934997559, "learning_rate": 4.883054129117791e-05, "loss": 3.7112, "step": 121500 }, { "epoch": 0.2553839970526175, "grad_norm": 10.746428489685059, "learning_rate": 4.882529703468761e-05, "loss": 3.7166, "step": 122000 }, { "epoch": 0.25643065277824295, "grad_norm": 9.995687484741211, "learning_rate": 4.882005277819732e-05, "loss": 3.7102, "step": 122500 }, { "epoch": 0.25747730850386846, "grad_norm": 9.406068801879883, "learning_rate": 4.881480852170703e-05, "loss": 3.7102, "step": 123000 }, { "epoch": 0.2585239642294939, "grad_norm": 10.425297737121582, "learning_rate": 4.880956426521674e-05, "loss": 3.7146, "step": 123500 }, { "epoch": 0.2595706199551194, "grad_norm": 11.773101806640625, "learning_rate": 4.880432000872644e-05, "loss": 3.6811, "step": 124000 }, { "epoch": 0.2606172756807449, "grad_norm": 9.578620910644531, "learning_rate": 4.8799075752236154e-05, "loss": 3.6813, "step": 124500 }, { "epoch": 0.26166393140637034, "grad_norm": 12.10355281829834, "learning_rate": 4.8793831495745865e-05, "loss": 3.6693, "step": 125000 }, { "epoch": 0.26271058713199585, "grad_norm": 18.296689987182617, "learning_rate": 4.878858723925557e-05, "loss": 3.6812, "step": 125500 }, { "epoch": 0.2637572428576213, "grad_norm": 10.175383567810059, "learning_rate": 4.878334298276528e-05, "loss": 3.7081, "step": 126000 }, { "epoch": 0.2648038985832468, "grad_norm": 11.165489196777344, "learning_rate": 4.8778098726274985e-05, "loss": 3.6976, "step": 126500 }, { "epoch": 0.2658505543088723, "grad_norm": 9.288297653198242, "learning_rate": 4.8772854469784696e-05, "loss": 3.6887, "step": 127000 }, { "epoch": 0.2668972100344978, "grad_norm": 17.841209411621094, "learning_rate": 4.87676102132944e-05, "loss": 3.6844, "step": 127500 }, { "epoch": 0.26794386576012325, "grad_norm": 10.734357833862305, "learning_rate": 4.876236595680411e-05, "loss": 3.6791, "step": 128000 }, { "epoch": 0.26899052148574876, "grad_norm": 10.153436660766602, "learning_rate": 4.8757121700313815e-05, "loss": 3.6674, "step": 128500 }, { "epoch": 0.2700371772113742, "grad_norm": 10.22332763671875, "learning_rate": 4.8751877443823526e-05, "loss": 3.6713, "step": 129000 }, { "epoch": 0.2710838329369997, "grad_norm": 14.261969566345215, "learning_rate": 4.874663318733324e-05, "loss": 3.6695, "step": 129500 }, { "epoch": 0.2721304886626252, "grad_norm": 9.342558860778809, "learning_rate": 4.874138893084294e-05, "loss": 3.6645, "step": 130000 }, { "epoch": 0.27317714438825064, "grad_norm": 9.66645336151123, "learning_rate": 4.873614467435265e-05, "loss": 3.6626, "step": 130500 }, { "epoch": 0.27422380011387615, "grad_norm": 11.663922309875488, "learning_rate": 4.873090041786236e-05, "loss": 3.6748, "step": 131000 }, { "epoch": 0.2752704558395016, "grad_norm": 15.538830757141113, "learning_rate": 4.872565616137207e-05, "loss": 3.6678, "step": 131500 }, { "epoch": 0.2763171115651271, "grad_norm": 12.959735870361328, "learning_rate": 4.872041190488177e-05, "loss": 3.6665, "step": 132000 }, { "epoch": 0.27736376729075257, "grad_norm": 11.006184577941895, "learning_rate": 4.8715167648391484e-05, "loss": 3.6447, "step": 132500 }, { "epoch": 0.2784104230163781, "grad_norm": 9.448165893554688, "learning_rate": 4.870992339190119e-05, "loss": 3.6431, "step": 133000 }, { "epoch": 0.27945707874200354, "grad_norm": 9.94456958770752, "learning_rate": 4.87046791354109e-05, "loss": 3.6366, "step": 133500 }, { "epoch": 0.28050373446762905, "grad_norm": 9.225470542907715, "learning_rate": 4.869943487892061e-05, "loss": 3.6425, "step": 134000 }, { "epoch": 0.2815503901932545, "grad_norm": 12.220582008361816, "learning_rate": 4.869419062243032e-05, "loss": 3.6439, "step": 134500 }, { "epoch": 0.28259704591888, "grad_norm": 14.487257957458496, "learning_rate": 4.8688946365940025e-05, "loss": 3.6345, "step": 135000 }, { "epoch": 0.2836437016445055, "grad_norm": 11.964679718017578, "learning_rate": 4.868370210944973e-05, "loss": 3.6218, "step": 135500 }, { "epoch": 0.28469035737013093, "grad_norm": 11.698953628540039, "learning_rate": 4.867845785295944e-05, "loss": 3.6404, "step": 136000 }, { "epoch": 0.28573701309575644, "grad_norm": 9.862428665161133, "learning_rate": 4.8673213596469145e-05, "loss": 3.6417, "step": 136500 }, { "epoch": 0.2867836688213819, "grad_norm": 11.150898933410645, "learning_rate": 4.8667969339978856e-05, "loss": 3.6498, "step": 137000 }, { "epoch": 0.2878303245470074, "grad_norm": 10.551894187927246, "learning_rate": 4.866272508348857e-05, "loss": 3.6192, "step": 137500 }, { "epoch": 0.28887698027263287, "grad_norm": 11.98263168334961, "learning_rate": 4.865748082699828e-05, "loss": 3.6386, "step": 138000 }, { "epoch": 0.2899236359982584, "grad_norm": 9.15615177154541, "learning_rate": 4.865223657050798e-05, "loss": 3.6312, "step": 138500 }, { "epoch": 0.29097029172388383, "grad_norm": 10.159266471862793, "learning_rate": 4.8646992314017694e-05, "loss": 3.6295, "step": 139000 }, { "epoch": 0.29201694744950935, "grad_norm": 10.220767974853516, "learning_rate": 4.86417480575274e-05, "loss": 3.6179, "step": 139500 }, { "epoch": 0.2930636031751348, "grad_norm": 8.701539039611816, "learning_rate": 4.863650380103711e-05, "loss": 3.6145, "step": 140000 }, { "epoch": 0.2941102589007603, "grad_norm": 9.564008712768555, "learning_rate": 4.8631259544546813e-05, "loss": 3.6122, "step": 140500 }, { "epoch": 0.29515691462638577, "grad_norm": 12.48365592956543, "learning_rate": 4.862601528805652e-05, "loss": 3.6173, "step": 141000 }, { "epoch": 0.2962035703520113, "grad_norm": 11.763916969299316, "learning_rate": 4.862077103156623e-05, "loss": 3.597, "step": 141500 }, { "epoch": 0.29725022607763674, "grad_norm": 9.560352325439453, "learning_rate": 4.861552677507594e-05, "loss": 3.6046, "step": 142000 }, { "epoch": 0.2982968818032622, "grad_norm": 12.799619674682617, "learning_rate": 4.861028251858565e-05, "loss": 3.5927, "step": 142500 }, { "epoch": 0.2993435375288877, "grad_norm": 11.397905349731445, "learning_rate": 4.8605038262095355e-05, "loss": 3.6023, "step": 143000 }, { "epoch": 0.30039019325451316, "grad_norm": 10.616772651672363, "learning_rate": 4.8599794005605066e-05, "loss": 3.5975, "step": 143500 }, { "epoch": 0.30143684898013867, "grad_norm": 9.623900413513184, "learning_rate": 4.859454974911477e-05, "loss": 3.5898, "step": 144000 }, { "epoch": 0.30248350470576413, "grad_norm": 10.935953140258789, "learning_rate": 4.858930549262448e-05, "loss": 3.5799, "step": 144500 }, { "epoch": 0.30353016043138964, "grad_norm": 10.900083541870117, "learning_rate": 4.8584061236134186e-05, "loss": 3.6053, "step": 145000 }, { "epoch": 0.3045768161570151, "grad_norm": 10.620619773864746, "learning_rate": 4.85788169796439e-05, "loss": 3.6079, "step": 145500 }, { "epoch": 0.3056234718826406, "grad_norm": 11.077310562133789, "learning_rate": 4.85735727231536e-05, "loss": 3.5776, "step": 146000 }, { "epoch": 0.30667012760826606, "grad_norm": 10.007762908935547, "learning_rate": 4.856832846666331e-05, "loss": 3.5638, "step": 146500 }, { "epoch": 0.3077167833338916, "grad_norm": 10.598138809204102, "learning_rate": 4.8563084210173024e-05, "loss": 3.583, "step": 147000 }, { "epoch": 0.30876343905951703, "grad_norm": 11.144253730773926, "learning_rate": 4.855783995368273e-05, "loss": 3.5848, "step": 147500 }, { "epoch": 0.3098100947851425, "grad_norm": 11.306970596313477, "learning_rate": 4.855259569719244e-05, "loss": 3.5941, "step": 148000 }, { "epoch": 0.310856750510768, "grad_norm": 9.877154350280762, "learning_rate": 4.854735144070214e-05, "loss": 3.563, "step": 148500 }, { "epoch": 0.31190340623639345, "grad_norm": 10.749898910522461, "learning_rate": 4.8542107184211854e-05, "loss": 3.5749, "step": 149000 }, { "epoch": 0.31295006196201897, "grad_norm": 15.005173683166504, "learning_rate": 4.853686292772156e-05, "loss": 3.5749, "step": 149500 }, { "epoch": 0.3139967176876444, "grad_norm": 9.143765449523926, "learning_rate": 4.853161867123127e-05, "loss": 3.5652, "step": 150000 }, { "epoch": 0.31504337341326993, "grad_norm": 9.946343421936035, "learning_rate": 4.8526374414740974e-05, "loss": 3.5668, "step": 150500 }, { "epoch": 0.3160900291388954, "grad_norm": 10.508331298828125, "learning_rate": 4.8521130158250685e-05, "loss": 3.5462, "step": 151000 }, { "epoch": 0.3171366848645209, "grad_norm": 10.284043312072754, "learning_rate": 4.8515885901760396e-05, "loss": 3.5455, "step": 151500 }, { "epoch": 0.31818334059014636, "grad_norm": 10.805180549621582, "learning_rate": 4.851064164527011e-05, "loss": 3.5639, "step": 152000 }, { "epoch": 0.31922999631577187, "grad_norm": 11.65955924987793, "learning_rate": 4.850539738877981e-05, "loss": 3.5582, "step": 152500 }, { "epoch": 0.3202766520413973, "grad_norm": 10.179997444152832, "learning_rate": 4.8500153132289516e-05, "loss": 3.5611, "step": 153000 }, { "epoch": 0.3213233077670228, "grad_norm": 10.81788444519043, "learning_rate": 4.849490887579923e-05, "loss": 3.5747, "step": 153500 }, { "epoch": 0.3223699634926483, "grad_norm": 10.119538307189941, "learning_rate": 4.848966461930893e-05, "loss": 3.5355, "step": 154000 }, { "epoch": 0.32341661921827375, "grad_norm": 11.75223445892334, "learning_rate": 4.848442036281864e-05, "loss": 3.5598, "step": 154500 }, { "epoch": 0.32446327494389926, "grad_norm": 10.74061393737793, "learning_rate": 4.847917610632835e-05, "loss": 3.5501, "step": 155000 }, { "epoch": 0.3255099306695247, "grad_norm": 10.233543395996094, "learning_rate": 4.8473931849838065e-05, "loss": 3.5191, "step": 155500 }, { "epoch": 0.32655658639515023, "grad_norm": 10.875897407531738, "learning_rate": 4.846868759334777e-05, "loss": 3.5507, "step": 156000 }, { "epoch": 0.3276032421207757, "grad_norm": 10.219343185424805, "learning_rate": 4.846344333685748e-05, "loss": 3.5364, "step": 156500 }, { "epoch": 0.3286498978464012, "grad_norm": 11.428631782531738, "learning_rate": 4.8458199080367184e-05, "loss": 3.5301, "step": 157000 }, { "epoch": 0.32969655357202665, "grad_norm": 10.078638076782227, "learning_rate": 4.8452954823876895e-05, "loss": 3.5255, "step": 157500 }, { "epoch": 0.33074320929765216, "grad_norm": 9.928384780883789, "learning_rate": 4.84477105673866e-05, "loss": 3.5453, "step": 158000 }, { "epoch": 0.3317898650232776, "grad_norm": 10.03620433807373, "learning_rate": 4.8442466310896304e-05, "loss": 3.5262, "step": 158500 }, { "epoch": 0.33283652074890313, "grad_norm": 11.071606636047363, "learning_rate": 4.8437222054406015e-05, "loss": 3.5194, "step": 159000 }, { "epoch": 0.3338831764745286, "grad_norm": 10.378827095031738, "learning_rate": 4.8431977797915726e-05, "loss": 3.5118, "step": 159500 }, { "epoch": 0.33492983220015404, "grad_norm": 9.44300651550293, "learning_rate": 4.842673354142544e-05, "loss": 3.5208, "step": 160000 }, { "epoch": 0.33597648792577955, "grad_norm": 10.029437065124512, "learning_rate": 4.842148928493514e-05, "loss": 3.5257, "step": 160500 }, { "epoch": 0.337023143651405, "grad_norm": 9.363441467285156, "learning_rate": 4.841624502844485e-05, "loss": 3.5363, "step": 161000 }, { "epoch": 0.3380697993770305, "grad_norm": 12.595246315002441, "learning_rate": 4.841100077195456e-05, "loss": 3.4989, "step": 161500 }, { "epoch": 0.339116455102656, "grad_norm": 10.906128883361816, "learning_rate": 4.840575651546427e-05, "loss": 3.5011, "step": 162000 }, { "epoch": 0.3401631108282815, "grad_norm": 11.750715255737305, "learning_rate": 4.840051225897397e-05, "loss": 3.5331, "step": 162500 }, { "epoch": 0.34120976655390695, "grad_norm": 10.51074504852295, "learning_rate": 4.839526800248368e-05, "loss": 3.5089, "step": 163000 }, { "epoch": 0.34225642227953246, "grad_norm": 9.795524597167969, "learning_rate": 4.839002374599339e-05, "loss": 3.5021, "step": 163500 }, { "epoch": 0.3433030780051579, "grad_norm": 11.36467456817627, "learning_rate": 4.83847794895031e-05, "loss": 3.5058, "step": 164000 }, { "epoch": 0.3443497337307834, "grad_norm": 12.384054183959961, "learning_rate": 4.837953523301281e-05, "loss": 3.5049, "step": 164500 }, { "epoch": 0.3453963894564089, "grad_norm": 8.808448791503906, "learning_rate": 4.8374290976522514e-05, "loss": 3.5125, "step": 165000 }, { "epoch": 0.34644304518203434, "grad_norm": 12.16399097442627, "learning_rate": 4.8369046720032225e-05, "loss": 3.4815, "step": 165500 }, { "epoch": 0.34748970090765985, "grad_norm": 12.59363079071045, "learning_rate": 4.836380246354193e-05, "loss": 3.4936, "step": 166000 }, { "epoch": 0.3485363566332853, "grad_norm": 11.870656967163086, "learning_rate": 4.835855820705164e-05, "loss": 3.4914, "step": 166500 }, { "epoch": 0.3495830123589108, "grad_norm": 14.470786094665527, "learning_rate": 4.8353313950561345e-05, "loss": 3.5026, "step": 167000 }, { "epoch": 0.3506296680845363, "grad_norm": 13.976658821105957, "learning_rate": 4.8348069694071056e-05, "loss": 3.4868, "step": 167500 }, { "epoch": 0.3516763238101618, "grad_norm": 10.48547649383545, "learning_rate": 4.834282543758076e-05, "loss": 3.475, "step": 168000 }, { "epoch": 0.35272297953578724, "grad_norm": 10.239022254943848, "learning_rate": 4.833758118109047e-05, "loss": 3.4873, "step": 168500 }, { "epoch": 0.35376963526141275, "grad_norm": 11.403185844421387, "learning_rate": 4.833233692460018e-05, "loss": 3.4797, "step": 169000 }, { "epoch": 0.3548162909870382, "grad_norm": 10.561127662658691, "learning_rate": 4.832709266810989e-05, "loss": 3.4801, "step": 169500 }, { "epoch": 0.3558629467126637, "grad_norm": 12.317484855651855, "learning_rate": 4.83218484116196e-05, "loss": 3.4764, "step": 170000 }, { "epoch": 0.3569096024382892, "grad_norm": 10.917819023132324, "learning_rate": 4.83166041551293e-05, "loss": 3.4656, "step": 170500 }, { "epoch": 0.3579562581639147, "grad_norm": 10.05319881439209, "learning_rate": 4.831135989863901e-05, "loss": 3.4883, "step": 171000 }, { "epoch": 0.35900291388954014, "grad_norm": 10.580421447753906, "learning_rate": 4.830611564214872e-05, "loss": 3.4748, "step": 171500 }, { "epoch": 0.3600495696151656, "grad_norm": 10.05270004272461, "learning_rate": 4.830087138565843e-05, "loss": 3.4759, "step": 172000 }, { "epoch": 0.3610962253407911, "grad_norm": 9.610018730163574, "learning_rate": 4.829562712916813e-05, "loss": 3.4808, "step": 172500 }, { "epoch": 0.36214288106641657, "grad_norm": 9.96599292755127, "learning_rate": 4.829038287267785e-05, "loss": 3.4708, "step": 173000 }, { "epoch": 0.3631895367920421, "grad_norm": 11.62179183959961, "learning_rate": 4.8285138616187555e-05, "loss": 3.4529, "step": 173500 }, { "epoch": 0.36423619251766753, "grad_norm": 13.293895721435547, "learning_rate": 4.8279894359697266e-05, "loss": 3.4675, "step": 174000 }, { "epoch": 0.36528284824329305, "grad_norm": 11.011073112487793, "learning_rate": 4.827465010320697e-05, "loss": 3.4774, "step": 174500 }, { "epoch": 0.3663295039689185, "grad_norm": 11.709229469299316, "learning_rate": 4.826940584671668e-05, "loss": 3.4715, "step": 175000 }, { "epoch": 0.367376159694544, "grad_norm": 22.54070472717285, "learning_rate": 4.8264161590226386e-05, "loss": 3.4692, "step": 175500 }, { "epoch": 0.36842281542016947, "grad_norm": 10.65783977508545, "learning_rate": 4.825891733373609e-05, "loss": 3.4607, "step": 176000 }, { "epoch": 0.369469471145795, "grad_norm": 10.615188598632812, "learning_rate": 4.82536730772458e-05, "loss": 3.4679, "step": 176500 }, { "epoch": 0.37051612687142044, "grad_norm": 10.126315116882324, "learning_rate": 4.824842882075551e-05, "loss": 3.4508, "step": 177000 }, { "epoch": 0.3715627825970459, "grad_norm": 11.070189476013184, "learning_rate": 4.824318456426522e-05, "loss": 3.4489, "step": 177500 }, { "epoch": 0.3726094383226714, "grad_norm": 11.450393676757812, "learning_rate": 4.823794030777493e-05, "loss": 3.4655, "step": 178000 }, { "epoch": 0.37365609404829686, "grad_norm": 10.161665916442871, "learning_rate": 4.823269605128464e-05, "loss": 3.4589, "step": 178500 }, { "epoch": 0.3747027497739224, "grad_norm": 12.312939643859863, "learning_rate": 4.822745179479434e-05, "loss": 3.4598, "step": 179000 }, { "epoch": 0.37574940549954783, "grad_norm": 10.091519355773926, "learning_rate": 4.8222207538304054e-05, "loss": 3.4448, "step": 179500 }, { "epoch": 0.37679606122517334, "grad_norm": 12.645812034606934, "learning_rate": 4.821696328181376e-05, "loss": 3.4326, "step": 180000 }, { "epoch": 0.3778427169507988, "grad_norm": 12.809340476989746, "learning_rate": 4.821171902532347e-05, "loss": 3.45, "step": 180500 }, { "epoch": 0.3788893726764243, "grad_norm": 10.731861114501953, "learning_rate": 4.8206474768833174e-05, "loss": 3.421, "step": 181000 }, { "epoch": 0.37993602840204976, "grad_norm": 10.301986694335938, "learning_rate": 4.8201230512342885e-05, "loss": 3.4611, "step": 181500 }, { "epoch": 0.3809826841276753, "grad_norm": 16.397619247436523, "learning_rate": 4.8195986255852596e-05, "loss": 3.4252, "step": 182000 }, { "epoch": 0.38202933985330073, "grad_norm": 12.400419235229492, "learning_rate": 4.81907419993623e-05, "loss": 3.45, "step": 182500 }, { "epoch": 0.3830759955789262, "grad_norm": 10.512582778930664, "learning_rate": 4.818549774287201e-05, "loss": 3.4241, "step": 183000 }, { "epoch": 0.3841226513045517, "grad_norm": 11.499505996704102, "learning_rate": 4.8180253486381716e-05, "loss": 3.4297, "step": 183500 }, { "epoch": 0.38516930703017715, "grad_norm": 16.747417449951172, "learning_rate": 4.817500922989143e-05, "loss": 3.428, "step": 184000 }, { "epoch": 0.38621596275580267, "grad_norm": 10.71003246307373, "learning_rate": 4.816976497340113e-05, "loss": 3.443, "step": 184500 }, { "epoch": 0.3872626184814281, "grad_norm": 12.24734878540039, "learning_rate": 4.816452071691084e-05, "loss": 3.4337, "step": 185000 }, { "epoch": 0.38830927420705363, "grad_norm": 10.048531532287598, "learning_rate": 4.8159276460420546e-05, "loss": 3.3994, "step": 185500 }, { "epoch": 0.3893559299326791, "grad_norm": 12.772525787353516, "learning_rate": 4.815403220393026e-05, "loss": 3.437, "step": 186000 }, { "epoch": 0.3904025856583046, "grad_norm": 14.685653686523438, "learning_rate": 4.814878794743997e-05, "loss": 3.4133, "step": 186500 }, { "epoch": 0.39144924138393006, "grad_norm": 11.671835899353027, "learning_rate": 4.814354369094967e-05, "loss": 3.4112, "step": 187000 }, { "epoch": 0.39249589710955557, "grad_norm": 11.606546401977539, "learning_rate": 4.8138299434459384e-05, "loss": 3.409, "step": 187500 }, { "epoch": 0.393542552835181, "grad_norm": 11.260401725769043, "learning_rate": 4.813305517796909e-05, "loss": 3.4018, "step": 188000 }, { "epoch": 0.39458920856080654, "grad_norm": 10.123751640319824, "learning_rate": 4.81278109214788e-05, "loss": 3.4108, "step": 188500 }, { "epoch": 0.395635864286432, "grad_norm": 11.894832611083984, "learning_rate": 4.8122566664988504e-05, "loss": 3.406, "step": 189000 }, { "epoch": 0.39668252001205745, "grad_norm": 11.00570011138916, "learning_rate": 4.8117322408498215e-05, "loss": 3.4083, "step": 189500 }, { "epoch": 0.39772917573768296, "grad_norm": 12.74232006072998, "learning_rate": 4.811207815200792e-05, "loss": 3.4116, "step": 190000 }, { "epoch": 0.3987758314633084, "grad_norm": 11.420770645141602, "learning_rate": 4.810683389551763e-05, "loss": 3.4158, "step": 190500 }, { "epoch": 0.39982248718893393, "grad_norm": 16.82096290588379, "learning_rate": 4.810158963902734e-05, "loss": 3.3994, "step": 191000 }, { "epoch": 0.4008691429145594, "grad_norm": 11.060291290283203, "learning_rate": 4.809634538253705e-05, "loss": 3.3834, "step": 191500 }, { "epoch": 0.4019157986401849, "grad_norm": 12.322439193725586, "learning_rate": 4.8091101126046757e-05, "loss": 3.4087, "step": 192000 }, { "epoch": 0.40296245436581035, "grad_norm": 10.809911727905273, "learning_rate": 4.808585686955646e-05, "loss": 3.4216, "step": 192500 }, { "epoch": 0.40400911009143586, "grad_norm": 12.680830955505371, "learning_rate": 4.808061261306617e-05, "loss": 3.4157, "step": 193000 }, { "epoch": 0.4050557658170613, "grad_norm": 10.405234336853027, "learning_rate": 4.8075368356575876e-05, "loss": 3.412, "step": 193500 }, { "epoch": 0.40610242154268683, "grad_norm": 11.325750350952148, "learning_rate": 4.807012410008559e-05, "loss": 3.4002, "step": 194000 }, { "epoch": 0.4071490772683123, "grad_norm": 10.326035499572754, "learning_rate": 4.80648798435953e-05, "loss": 3.389, "step": 194500 }, { "epoch": 0.40819573299393774, "grad_norm": 10.16747760772705, "learning_rate": 4.805963558710501e-05, "loss": 3.4216, "step": 195000 }, { "epoch": 0.40924238871956325, "grad_norm": 11.919361114501953, "learning_rate": 4.8054391330614714e-05, "loss": 3.4115, "step": 195500 }, { "epoch": 0.4102890444451887, "grad_norm": 11.383450508117676, "learning_rate": 4.8049147074124425e-05, "loss": 3.3786, "step": 196000 }, { "epoch": 0.4113357001708142, "grad_norm": 10.544475555419922, "learning_rate": 4.804390281763413e-05, "loss": 3.4085, "step": 196500 }, { "epoch": 0.4123823558964397, "grad_norm": 9.772621154785156, "learning_rate": 4.803865856114384e-05, "loss": 3.4117, "step": 197000 }, { "epoch": 0.4134290116220652, "grad_norm": 10.270927429199219, "learning_rate": 4.8033414304653544e-05, "loss": 3.3872, "step": 197500 }, { "epoch": 0.41447566734769065, "grad_norm": 9.77122688293457, "learning_rate": 4.802817004816325e-05, "loss": 3.3836, "step": 198000 }, { "epoch": 0.41552232307331616, "grad_norm": 11.148908615112305, "learning_rate": 4.802292579167296e-05, "loss": 3.3575, "step": 198500 }, { "epoch": 0.4165689787989416, "grad_norm": 10.737231254577637, "learning_rate": 4.801768153518267e-05, "loss": 3.3981, "step": 199000 }, { "epoch": 0.4176156345245671, "grad_norm": 12.284107208251953, "learning_rate": 4.801243727869238e-05, "loss": 3.4013, "step": 199500 }, { "epoch": 0.4186622902501926, "grad_norm": 10.228964805603027, "learning_rate": 4.8007193022202086e-05, "loss": 3.3815, "step": 200000 }, { "epoch": 0.4197089459758181, "grad_norm": 11.971832275390625, "learning_rate": 4.80019487657118e-05, "loss": 3.383, "step": 200500 }, { "epoch": 0.42075560170144355, "grad_norm": 10.371969223022461, "learning_rate": 4.79967045092215e-05, "loss": 3.3838, "step": 201000 }, { "epoch": 0.421802257427069, "grad_norm": 10.181611061096191, "learning_rate": 4.799146025273121e-05, "loss": 3.3639, "step": 201500 }, { "epoch": 0.4228489131526945, "grad_norm": 11.001091957092285, "learning_rate": 4.798621599624092e-05, "loss": 3.363, "step": 202000 }, { "epoch": 0.42389556887832, "grad_norm": 10.615559577941895, "learning_rate": 4.798097173975063e-05, "loss": 3.4197, "step": 202500 }, { "epoch": 0.4249422246039455, "grad_norm": 10.604320526123047, "learning_rate": 4.797572748326033e-05, "loss": 3.369, "step": 203000 }, { "epoch": 0.42598888032957094, "grad_norm": 10.329935073852539, "learning_rate": 4.7970483226770044e-05, "loss": 3.3586, "step": 203500 }, { "epoch": 0.42703553605519645, "grad_norm": 9.436850547790527, "learning_rate": 4.7965238970279755e-05, "loss": 3.3869, "step": 204000 }, { "epoch": 0.4280821917808219, "grad_norm": 10.6071195602417, "learning_rate": 4.795999471378946e-05, "loss": 3.371, "step": 204500 }, { "epoch": 0.4291288475064474, "grad_norm": 12.931861877441406, "learning_rate": 4.795475045729917e-05, "loss": 3.3731, "step": 205000 }, { "epoch": 0.4301755032320729, "grad_norm": 11.769183158874512, "learning_rate": 4.7949506200808874e-05, "loss": 3.3666, "step": 205500 }, { "epoch": 0.4312221589576984, "grad_norm": 11.604097366333008, "learning_rate": 4.7944261944318585e-05, "loss": 3.3754, "step": 206000 }, { "epoch": 0.43226881468332384, "grad_norm": 10.262664794921875, "learning_rate": 4.793901768782829e-05, "loss": 3.3515, "step": 206500 }, { "epoch": 0.4333154704089493, "grad_norm": 10.124714851379395, "learning_rate": 4.7933773431338e-05, "loss": 3.354, "step": 207000 }, { "epoch": 0.4343621261345748, "grad_norm": 10.745051383972168, "learning_rate": 4.7928529174847705e-05, "loss": 3.3586, "step": 207500 }, { "epoch": 0.43540878186020027, "grad_norm": 11.749173164367676, "learning_rate": 4.7923284918357416e-05, "loss": 3.3627, "step": 208000 }, { "epoch": 0.4364554375858258, "grad_norm": 10.849390983581543, "learning_rate": 4.791804066186713e-05, "loss": 3.3387, "step": 208500 }, { "epoch": 0.43750209331145123, "grad_norm": 10.715561866760254, "learning_rate": 4.791279640537684e-05, "loss": 3.3619, "step": 209000 }, { "epoch": 0.43854874903707675, "grad_norm": 11.967480659484863, "learning_rate": 4.790755214888654e-05, "loss": 3.3529, "step": 209500 }, { "epoch": 0.4395954047627022, "grad_norm": 11.503972053527832, "learning_rate": 4.790230789239625e-05, "loss": 3.3523, "step": 210000 }, { "epoch": 0.4406420604883277, "grad_norm": 9.704139709472656, "learning_rate": 4.789706363590596e-05, "loss": 3.3384, "step": 210500 }, { "epoch": 0.44168871621395317, "grad_norm": 10.480524063110352, "learning_rate": 4.789181937941566e-05, "loss": 3.3308, "step": 211000 }, { "epoch": 0.4427353719395787, "grad_norm": 9.06908130645752, "learning_rate": 4.7886575122925373e-05, "loss": 3.3692, "step": 211500 }, { "epoch": 0.44378202766520414, "grad_norm": 9.806156158447266, "learning_rate": 4.7881330866435084e-05, "loss": 3.3535, "step": 212000 }, { "epoch": 0.44482868339082965, "grad_norm": 11.005589485168457, "learning_rate": 4.7876086609944796e-05, "loss": 3.3419, "step": 212500 }, { "epoch": 0.4458753391164551, "grad_norm": 9.831252098083496, "learning_rate": 4.78708423534545e-05, "loss": 3.3619, "step": 213000 }, { "epoch": 0.44692199484208056, "grad_norm": 10.76080322265625, "learning_rate": 4.786559809696421e-05, "loss": 3.326, "step": 213500 }, { "epoch": 0.4479686505677061, "grad_norm": 9.859101295471191, "learning_rate": 4.7860353840473915e-05, "loss": 3.3092, "step": 214000 }, { "epoch": 0.44901530629333153, "grad_norm": Infinity, "learning_rate": 4.7855109583983626e-05, "loss": 3.341, "step": 214500 }, { "epoch": 0.45006196201895704, "grad_norm": 9.945215225219727, "learning_rate": 4.784986532749333e-05, "loss": 3.3503, "step": 215000 }, { "epoch": 0.4511086177445825, "grad_norm": 9.162121772766113, "learning_rate": 4.7844621071003035e-05, "loss": 3.3258, "step": 215500 }, { "epoch": 0.452155273470208, "grad_norm": 9.615477561950684, "learning_rate": 4.7839376814512746e-05, "loss": 3.3151, "step": 216000 }, { "epoch": 0.45320192919583346, "grad_norm": 14.504148483276367, "learning_rate": 4.783413255802246e-05, "loss": 3.3455, "step": 216500 }, { "epoch": 0.454248584921459, "grad_norm": 10.406500816345215, "learning_rate": 4.782888830153217e-05, "loss": 3.3236, "step": 217000 }, { "epoch": 0.45529524064708443, "grad_norm": 9.952347755432129, "learning_rate": 4.782364404504187e-05, "loss": 3.3307, "step": 217500 }, { "epoch": 0.45634189637270994, "grad_norm": 10.187820434570312, "learning_rate": 4.7818399788551584e-05, "loss": 3.3229, "step": 218000 }, { "epoch": 0.4573885520983354, "grad_norm": 9.727540016174316, "learning_rate": 4.781315553206129e-05, "loss": 3.3298, "step": 218500 }, { "epoch": 0.45843520782396086, "grad_norm": 11.574451446533203, "learning_rate": 4.7807911275571e-05, "loss": 3.3231, "step": 219000 }, { "epoch": 0.45948186354958637, "grad_norm": 11.0931396484375, "learning_rate": 4.78026670190807e-05, "loss": 3.3453, "step": 219500 }, { "epoch": 0.4605285192752118, "grad_norm": 11.656729698181152, "learning_rate": 4.7797422762590414e-05, "loss": 3.3187, "step": 220000 }, { "epoch": 0.46157517500083733, "grad_norm": 10.328500747680664, "learning_rate": 4.779217850610012e-05, "loss": 3.3439, "step": 220500 }, { "epoch": 0.4626218307264628, "grad_norm": 10.2681245803833, "learning_rate": 4.778693424960983e-05, "loss": 3.3194, "step": 221000 }, { "epoch": 0.4636684864520883, "grad_norm": 9.979874610900879, "learning_rate": 4.778168999311954e-05, "loss": 3.3311, "step": 221500 }, { "epoch": 0.46471514217771376, "grad_norm": 10.180366516113281, "learning_rate": 4.7776445736629245e-05, "loss": 3.3151, "step": 222000 }, { "epoch": 0.46576179790333927, "grad_norm": 9.833127975463867, "learning_rate": 4.7771201480138956e-05, "loss": 3.2932, "step": 222500 }, { "epoch": 0.4668084536289647, "grad_norm": 13.276266098022461, "learning_rate": 4.776595722364866e-05, "loss": 3.3201, "step": 223000 }, { "epoch": 0.46785510935459024, "grad_norm": 10.376708984375, "learning_rate": 4.776071296715837e-05, "loss": 3.3219, "step": 223500 }, { "epoch": 0.4689017650802157, "grad_norm": 10.381233215332031, "learning_rate": 4.7755468710668076e-05, "loss": 3.3227, "step": 224000 }, { "epoch": 0.46994842080584115, "grad_norm": 9.938654899597168, "learning_rate": 4.775022445417779e-05, "loss": 3.3302, "step": 224500 }, { "epoch": 0.47099507653146666, "grad_norm": 18.017099380493164, "learning_rate": 4.774498019768749e-05, "loss": 3.309, "step": 225000 }, { "epoch": 0.4720417322570921, "grad_norm": 12.489651679992676, "learning_rate": 4.77397359411972e-05, "loss": 3.304, "step": 225500 }, { "epoch": 0.47308838798271763, "grad_norm": 10.201933860778809, "learning_rate": 4.7734491684706913e-05, "loss": 3.3099, "step": 226000 }, { "epoch": 0.4741350437083431, "grad_norm": 8.419718742370605, "learning_rate": 4.772924742821662e-05, "loss": 3.3089, "step": 226500 }, { "epoch": 0.4751816994339686, "grad_norm": 10.876585006713867, "learning_rate": 4.772400317172633e-05, "loss": 3.3015, "step": 227000 }, { "epoch": 0.47622835515959405, "grad_norm": 12.112421035766602, "learning_rate": 4.771875891523603e-05, "loss": 3.3065, "step": 227500 }, { "epoch": 0.47727501088521956, "grad_norm": 12.425979614257812, "learning_rate": 4.7713514658745744e-05, "loss": 3.2989, "step": 228000 }, { "epoch": 0.478321666610845, "grad_norm": 11.909448623657227, "learning_rate": 4.770827040225545e-05, "loss": 3.306, "step": 228500 }, { "epoch": 0.47936832233647053, "grad_norm": 12.37705135345459, "learning_rate": 4.770302614576516e-05, "loss": 3.2669, "step": 229000 }, { "epoch": 0.480414978062096, "grad_norm": 10.09940242767334, "learning_rate": 4.769778188927487e-05, "loss": 3.2919, "step": 229500 }, { "epoch": 0.4814616337877215, "grad_norm": 10.72971248626709, "learning_rate": 4.769253763278458e-05, "loss": 3.3023, "step": 230000 }, { "epoch": 0.48250828951334696, "grad_norm": 10.56069564819336, "learning_rate": 4.7687293376294286e-05, "loss": 3.2848, "step": 230500 }, { "epoch": 0.4835549452389724, "grad_norm": 11.02773666381836, "learning_rate": 4.7682049119804e-05, "loss": 3.2827, "step": 231000 }, { "epoch": 0.4846016009645979, "grad_norm": 11.52928638458252, "learning_rate": 4.76768048633137e-05, "loss": 3.3157, "step": 231500 }, { "epoch": 0.4856482566902234, "grad_norm": 11.727448463439941, "learning_rate": 4.7671560606823406e-05, "loss": 3.3013, "step": 232000 }, { "epoch": 0.4866949124158489, "grad_norm": 17.037269592285156, "learning_rate": 4.766631635033312e-05, "loss": 3.2609, "step": 232500 }, { "epoch": 0.48774156814147435, "grad_norm": 10.367916107177734, "learning_rate": 4.766107209384282e-05, "loss": 3.2922, "step": 233000 }, { "epoch": 0.48878822386709986, "grad_norm": 12.98807430267334, "learning_rate": 4.765582783735253e-05, "loss": 3.2919, "step": 233500 }, { "epoch": 0.4898348795927253, "grad_norm": 11.527999877929688, "learning_rate": 4.765058358086224e-05, "loss": 3.314, "step": 234000 }, { "epoch": 0.4908815353183508, "grad_norm": 13.287246704101562, "learning_rate": 4.7645339324371954e-05, "loss": 3.302, "step": 234500 }, { "epoch": 0.4919281910439763, "grad_norm": 10.47502613067627, "learning_rate": 4.764009506788166e-05, "loss": 3.309, "step": 235000 }, { "epoch": 0.4929748467696018, "grad_norm": 9.35061264038086, "learning_rate": 4.763485081139137e-05, "loss": 3.2908, "step": 235500 }, { "epoch": 0.49402150249522725, "grad_norm": 10.327001571655273, "learning_rate": 4.7629606554901074e-05, "loss": 3.279, "step": 236000 }, { "epoch": 0.4950681582208527, "grad_norm": 10.865884780883789, "learning_rate": 4.7624362298410785e-05, "loss": 3.2579, "step": 236500 }, { "epoch": 0.4961148139464782, "grad_norm": 11.66104507446289, "learning_rate": 4.761911804192049e-05, "loss": 3.2786, "step": 237000 }, { "epoch": 0.4971614696721037, "grad_norm": 11.577054977416992, "learning_rate": 4.7613873785430194e-05, "loss": 3.2856, "step": 237500 }, { "epoch": 0.4982081253977292, "grad_norm": 10.246381759643555, "learning_rate": 4.7608629528939905e-05, "loss": 3.2788, "step": 238000 }, { "epoch": 0.49925478112335464, "grad_norm": 13.074071884155273, "learning_rate": 4.7603385272449616e-05, "loss": 3.2832, "step": 238500 }, { "epoch": 0.5003014368489801, "grad_norm": 12.01363468170166, "learning_rate": 4.759814101595933e-05, "loss": 3.2916, "step": 239000 }, { "epoch": 0.5013480925746057, "grad_norm": 12.348705291748047, "learning_rate": 4.759289675946903e-05, "loss": 3.2846, "step": 239500 }, { "epoch": 0.5023947483002311, "grad_norm": 10.607982635498047, "learning_rate": 4.758765250297874e-05, "loss": 3.2729, "step": 240000 }, { "epoch": 0.5034414040258566, "grad_norm": 10.155036926269531, "learning_rate": 4.7582408246488447e-05, "loss": 3.2927, "step": 240500 }, { "epoch": 0.504488059751482, "grad_norm": 13.156068801879883, "learning_rate": 4.757716398999816e-05, "loss": 3.278, "step": 241000 }, { "epoch": 0.5055347154771076, "grad_norm": 11.345580101013184, "learning_rate": 4.757191973350786e-05, "loss": 3.2934, "step": 241500 }, { "epoch": 0.506581371202733, "grad_norm": 9.942957878112793, "learning_rate": 4.756667547701757e-05, "loss": 3.2635, "step": 242000 }, { "epoch": 0.5076280269283585, "grad_norm": 10.676426887512207, "learning_rate": 4.756143122052728e-05, "loss": 3.271, "step": 242500 }, { "epoch": 0.508674682653984, "grad_norm": 12.352347373962402, "learning_rate": 4.755618696403699e-05, "loss": 3.2661, "step": 243000 }, { "epoch": 0.5097213383796094, "grad_norm": 10.62816333770752, "learning_rate": 4.75509427075467e-05, "loss": 3.277, "step": 243500 }, { "epoch": 0.510767994105235, "grad_norm": 23.25828742980957, "learning_rate": 4.7545698451056404e-05, "loss": 3.2707, "step": 244000 }, { "epoch": 0.5118146498308604, "grad_norm": 9.740167617797852, "learning_rate": 4.7540454194566115e-05, "loss": 3.2512, "step": 244500 }, { "epoch": 0.5128613055564859, "grad_norm": 11.710044860839844, "learning_rate": 4.753520993807582e-05, "loss": 3.2886, "step": 245000 }, { "epoch": 0.5139079612821114, "grad_norm": 11.449296951293945, "learning_rate": 4.752996568158553e-05, "loss": 3.2592, "step": 245500 }, { "epoch": 0.5149546170077369, "grad_norm": 12.369144439697266, "learning_rate": 4.7524721425095235e-05, "loss": 3.2675, "step": 246000 }, { "epoch": 0.5160012727333624, "grad_norm": 12.875565528869629, "learning_rate": 4.7519477168604946e-05, "loss": 3.2706, "step": 246500 }, { "epoch": 0.5170479284589878, "grad_norm": 11.443960189819336, "learning_rate": 4.751423291211466e-05, "loss": 3.27, "step": 247000 }, { "epoch": 0.5180945841846133, "grad_norm": 10.942846298217773, "learning_rate": 4.750898865562437e-05, "loss": 3.2789, "step": 247500 }, { "epoch": 0.5191412399102387, "grad_norm": 10.40578556060791, "learning_rate": 4.750374439913407e-05, "loss": 3.2753, "step": 248000 }, { "epoch": 0.5201878956358643, "grad_norm": 11.1233491897583, "learning_rate": 4.749850014264378e-05, "loss": 3.2606, "step": 248500 }, { "epoch": 0.5212345513614898, "grad_norm": 13.182182312011719, "learning_rate": 4.749325588615349e-05, "loss": 3.2386, "step": 249000 }, { "epoch": 0.5222812070871152, "grad_norm": 10.928067207336426, "learning_rate": 4.748801162966319e-05, "loss": 3.2637, "step": 249500 }, { "epoch": 0.5233278628127407, "grad_norm": 11.476807594299316, "learning_rate": 4.74827673731729e-05, "loss": 3.2688, "step": 250000 }, { "epoch": 0.5243745185383663, "grad_norm": 15.408782958984375, "learning_rate": 4.747752311668261e-05, "loss": 3.2447, "step": 250500 }, { "epoch": 0.5254211742639917, "grad_norm": 38.605316162109375, "learning_rate": 4.747227886019232e-05, "loss": 3.2571, "step": 251000 }, { "epoch": 0.5264678299896172, "grad_norm": 11.402087211608887, "learning_rate": 4.746703460370203e-05, "loss": 3.2573, "step": 251500 }, { "epoch": 0.5275144857152426, "grad_norm": 12.527499198913574, "learning_rate": 4.746179034721174e-05, "loss": 3.2527, "step": 252000 }, { "epoch": 0.5285611414408682, "grad_norm": 10.557287216186523, "learning_rate": 4.7456546090721445e-05, "loss": 3.2445, "step": 252500 }, { "epoch": 0.5296077971664936, "grad_norm": 13.688509941101074, "learning_rate": 4.7451301834231156e-05, "loss": 3.2645, "step": 253000 }, { "epoch": 0.5306544528921191, "grad_norm": 9.785298347473145, "learning_rate": 4.744605757774086e-05, "loss": 3.2445, "step": 253500 }, { "epoch": 0.5317011086177446, "grad_norm": 12.193055152893066, "learning_rate": 4.744081332125057e-05, "loss": 3.2369, "step": 254000 }, { "epoch": 0.53274776434337, "grad_norm": 12.82094955444336, "learning_rate": 4.7435569064760276e-05, "loss": 3.2787, "step": 254500 }, { "epoch": 0.5337944200689956, "grad_norm": 24.16873550415039, "learning_rate": 4.743032480826998e-05, "loss": 3.2332, "step": 255000 }, { "epoch": 0.534841075794621, "grad_norm": 11.065851211547852, "learning_rate": 4.742508055177969e-05, "loss": 3.2484, "step": 255500 }, { "epoch": 0.5358877315202465, "grad_norm": 14.155463218688965, "learning_rate": 4.74198362952894e-05, "loss": 3.2403, "step": 256000 }, { "epoch": 0.536934387245872, "grad_norm": 11.524399757385254, "learning_rate": 4.741459203879911e-05, "loss": 3.25, "step": 256500 }, { "epoch": 0.5379810429714975, "grad_norm": 13.587599754333496, "learning_rate": 4.740934778230882e-05, "loss": 3.2538, "step": 257000 }, { "epoch": 0.539027698697123, "grad_norm": 11.46796989440918, "learning_rate": 4.740410352581853e-05, "loss": 3.2462, "step": 257500 }, { "epoch": 0.5400743544227484, "grad_norm": 12.342914581298828, "learning_rate": 4.739885926932823e-05, "loss": 3.2426, "step": 258000 }, { "epoch": 0.5411210101483739, "grad_norm": 41.22052764892578, "learning_rate": 4.7393615012837944e-05, "loss": 3.2556, "step": 258500 }, { "epoch": 0.5421676658739994, "grad_norm": 29.183998107910156, "learning_rate": 4.738837075634765e-05, "loss": 3.2266, "step": 259000 }, { "epoch": 0.5432143215996249, "grad_norm": 14.448291778564453, "learning_rate": 4.738312649985736e-05, "loss": 3.2425, "step": 259500 }, { "epoch": 0.5442609773252504, "grad_norm": 10.818593978881836, "learning_rate": 4.7377882243367063e-05, "loss": 3.2436, "step": 260000 }, { "epoch": 0.5453076330508758, "grad_norm": 10.220304489135742, "learning_rate": 4.7372637986876775e-05, "loss": 3.2569, "step": 260500 }, { "epoch": 0.5463542887765013, "grad_norm": 16.36034393310547, "learning_rate": 4.7367393730386486e-05, "loss": 3.2266, "step": 261000 }, { "epoch": 0.5474009445021268, "grad_norm": 12.029895782470703, "learning_rate": 4.736214947389619e-05, "loss": 3.229, "step": 261500 }, { "epoch": 0.5484476002277523, "grad_norm": 10.170304298400879, "learning_rate": 4.73569052174059e-05, "loss": 3.2446, "step": 262000 }, { "epoch": 0.5494942559533778, "grad_norm": 24.072002410888672, "learning_rate": 4.7351660960915605e-05, "loss": 3.2515, "step": 262500 }, { "epoch": 0.5505409116790032, "grad_norm": 12.291316032409668, "learning_rate": 4.7346416704425316e-05, "loss": 3.2615, "step": 263000 }, { "epoch": 0.5515875674046288, "grad_norm": 18.763093948364258, "learning_rate": 4.734117244793502e-05, "loss": 3.2241, "step": 263500 }, { "epoch": 0.5526342231302542, "grad_norm": 22.916902542114258, "learning_rate": 4.733592819144473e-05, "loss": 3.2466, "step": 264000 }, { "epoch": 0.5536808788558797, "grad_norm": 12.441400527954102, "learning_rate": 4.733068393495444e-05, "loss": 3.2577, "step": 264500 }, { "epoch": 0.5547275345815051, "grad_norm": 14.343952178955078, "learning_rate": 4.7325439678464154e-05, "loss": 3.2476, "step": 265000 }, { "epoch": 0.5557741903071307, "grad_norm": 13.14834213256836, "learning_rate": 4.732019542197386e-05, "loss": 3.2173, "step": 265500 }, { "epoch": 0.5568208460327562, "grad_norm": 13.418262481689453, "learning_rate": 4.731495116548356e-05, "loss": 3.2244, "step": 266000 }, { "epoch": 0.5578675017583816, "grad_norm": 21.17820167541504, "learning_rate": 4.7309706908993274e-05, "loss": 3.2361, "step": 266500 }, { "epoch": 0.5589141574840071, "grad_norm": 18.050905227661133, "learning_rate": 4.730446265250298e-05, "loss": 3.2394, "step": 267000 }, { "epoch": 0.5599608132096325, "grad_norm": 10.767925262451172, "learning_rate": 4.729921839601269e-05, "loss": 3.2121, "step": 267500 }, { "epoch": 0.5610074689352581, "grad_norm": 14.586458206176758, "learning_rate": 4.729397413952239e-05, "loss": 3.2149, "step": 268000 }, { "epoch": 0.5620541246608836, "grad_norm": 11.322100639343262, "learning_rate": 4.7288729883032104e-05, "loss": 3.2355, "step": 268500 }, { "epoch": 0.563100780386509, "grad_norm": 10.188373565673828, "learning_rate": 4.7283485626541815e-05, "loss": 3.2441, "step": 269000 }, { "epoch": 0.5641474361121345, "grad_norm": 11.419088363647461, "learning_rate": 4.7278241370051527e-05, "loss": 3.2256, "step": 269500 }, { "epoch": 0.56519409183776, "grad_norm": 13.964337348937988, "learning_rate": 4.727299711356123e-05, "loss": 3.2316, "step": 270000 }, { "epoch": 0.5662407475633855, "grad_norm": 12.28486156463623, "learning_rate": 4.726775285707094e-05, "loss": 3.2488, "step": 270500 }, { "epoch": 0.567287403289011, "grad_norm": 10.87983226776123, "learning_rate": 4.7262508600580646e-05, "loss": 3.2057, "step": 271000 }, { "epoch": 0.5683340590146364, "grad_norm": 12.951556205749512, "learning_rate": 4.725726434409035e-05, "loss": 3.2116, "step": 271500 }, { "epoch": 0.5693807147402619, "grad_norm": 12.187336921691895, "learning_rate": 4.725202008760006e-05, "loss": 3.2319, "step": 272000 }, { "epoch": 0.5704273704658874, "grad_norm": 12.295306205749512, "learning_rate": 4.7246775831109766e-05, "loss": 3.2259, "step": 272500 }, { "epoch": 0.5714740261915129, "grad_norm": 13.083463668823242, "learning_rate": 4.724153157461948e-05, "loss": 3.2387, "step": 273000 }, { "epoch": 0.5725206819171383, "grad_norm": 10.999021530151367, "learning_rate": 4.723628731812919e-05, "loss": 3.2148, "step": 273500 }, { "epoch": 0.5735673376427638, "grad_norm": 12.617344856262207, "learning_rate": 4.72310430616389e-05, "loss": 3.2212, "step": 274000 }, { "epoch": 0.5746139933683894, "grad_norm": 10.518115043640137, "learning_rate": 4.7225798805148603e-05, "loss": 3.232, "step": 274500 }, { "epoch": 0.5756606490940148, "grad_norm": 20.443981170654297, "learning_rate": 4.7220554548658315e-05, "loss": 3.2264, "step": 275000 }, { "epoch": 0.5767073048196403, "grad_norm": 12.151309967041016, "learning_rate": 4.721531029216802e-05, "loss": 3.2019, "step": 275500 }, { "epoch": 0.5777539605452657, "grad_norm": 12.014036178588867, "learning_rate": 4.721006603567773e-05, "loss": 3.252, "step": 276000 }, { "epoch": 0.5788006162708913, "grad_norm": 11.327113151550293, "learning_rate": 4.7204821779187434e-05, "loss": 3.2144, "step": 276500 }, { "epoch": 0.5798472719965168, "grad_norm": 12.61948299407959, "learning_rate": 4.7199577522697145e-05, "loss": 3.2111, "step": 277000 }, { "epoch": 0.5808939277221422, "grad_norm": 17.71512794494629, "learning_rate": 4.719433326620685e-05, "loss": 3.223, "step": 277500 }, { "epoch": 0.5819405834477677, "grad_norm": 22.78300666809082, "learning_rate": 4.718908900971656e-05, "loss": 3.2052, "step": 278000 }, { "epoch": 0.5829872391733931, "grad_norm": 12.066803932189941, "learning_rate": 4.718384475322627e-05, "loss": 3.1965, "step": 278500 }, { "epoch": 0.5840338948990187, "grad_norm": 10.79959774017334, "learning_rate": 4.7178600496735976e-05, "loss": 3.2386, "step": 279000 }, { "epoch": 0.5850805506246441, "grad_norm": 11.833313941955566, "learning_rate": 4.717335624024569e-05, "loss": 3.2267, "step": 279500 }, { "epoch": 0.5861272063502696, "grad_norm": 10.815296173095703, "learning_rate": 4.716811198375539e-05, "loss": 3.2076, "step": 280000 }, { "epoch": 0.5871738620758951, "grad_norm": 11.643335342407227, "learning_rate": 4.71628677272651e-05, "loss": 3.2328, "step": 280500 }, { "epoch": 0.5882205178015206, "grad_norm": 13.124659538269043, "learning_rate": 4.715762347077481e-05, "loss": 3.214, "step": 281000 }, { "epoch": 0.5892671735271461, "grad_norm": 13.725311279296875, "learning_rate": 4.715237921428452e-05, "loss": 3.2119, "step": 281500 }, { "epoch": 0.5903138292527715, "grad_norm": 12.865479469299316, "learning_rate": 4.714713495779422e-05, "loss": 3.2364, "step": 282000 }, { "epoch": 0.591360484978397, "grad_norm": 10.1745023727417, "learning_rate": 4.714189070130394e-05, "loss": 3.2187, "step": 282500 }, { "epoch": 0.5924071407040226, "grad_norm": 13.098953247070312, "learning_rate": 4.7136646444813644e-05, "loss": 3.198, "step": 283000 }, { "epoch": 0.593453796429648, "grad_norm": 10.606191635131836, "learning_rate": 4.713140218832335e-05, "loss": 3.2083, "step": 283500 }, { "epoch": 0.5945004521552735, "grad_norm": 12.455157279968262, "learning_rate": 4.712615793183306e-05, "loss": 3.1976, "step": 284000 }, { "epoch": 0.5955471078808989, "grad_norm": 18.415294647216797, "learning_rate": 4.7120913675342764e-05, "loss": 3.1948, "step": 284500 }, { "epoch": 0.5965937636065244, "grad_norm": 14.928581237792969, "learning_rate": 4.7115669418852475e-05, "loss": 3.2003, "step": 285000 }, { "epoch": 0.59764041933215, "grad_norm": 13.106548309326172, "learning_rate": 4.711042516236218e-05, "loss": 3.2105, "step": 285500 }, { "epoch": 0.5986870750577754, "grad_norm": 12.409499168395996, "learning_rate": 4.710518090587189e-05, "loss": 3.2117, "step": 286000 }, { "epoch": 0.5997337307834009, "grad_norm": 11.001574516296387, "learning_rate": 4.70999366493816e-05, "loss": 3.2168, "step": 286500 }, { "epoch": 0.6007803865090263, "grad_norm": 15.624911308288574, "learning_rate": 4.709469239289131e-05, "loss": 3.1982, "step": 287000 }, { "epoch": 0.6018270422346519, "grad_norm": 10.523412704467773, "learning_rate": 4.708944813640102e-05, "loss": 3.1894, "step": 287500 }, { "epoch": 0.6028736979602773, "grad_norm": 13.634712219238281, "learning_rate": 4.708420387991073e-05, "loss": 3.2218, "step": 288000 }, { "epoch": 0.6039203536859028, "grad_norm": 10.445843696594238, "learning_rate": 4.707895962342043e-05, "loss": 3.2118, "step": 288500 }, { "epoch": 0.6049670094115283, "grad_norm": 11.101638793945312, "learning_rate": 4.707371536693014e-05, "loss": 3.1995, "step": 289000 }, { "epoch": 0.6060136651371537, "grad_norm": 12.887903213500977, "learning_rate": 4.706847111043985e-05, "loss": 3.2087, "step": 289500 }, { "epoch": 0.6070603208627793, "grad_norm": 11.723923683166504, "learning_rate": 4.706322685394955e-05, "loss": 3.2198, "step": 290000 }, { "epoch": 0.6081069765884047, "grad_norm": 12.099041938781738, "learning_rate": 4.705798259745926e-05, "loss": 3.2031, "step": 290500 }, { "epoch": 0.6091536323140302, "grad_norm": 11.49376106262207, "learning_rate": 4.7052738340968974e-05, "loss": 3.2053, "step": 291000 }, { "epoch": 0.6102002880396556, "grad_norm": 14.615431785583496, "learning_rate": 4.7047494084478685e-05, "loss": 3.2099, "step": 291500 }, { "epoch": 0.6112469437652812, "grad_norm": 12.73239803314209, "learning_rate": 4.704224982798839e-05, "loss": 3.2089, "step": 292000 }, { "epoch": 0.6122935994909067, "grad_norm": 12.263916969299316, "learning_rate": 4.70370055714981e-05, "loss": 3.2071, "step": 292500 }, { "epoch": 0.6133402552165321, "grad_norm": 9.607510566711426, "learning_rate": 4.7031761315007805e-05, "loss": 3.1985, "step": 293000 }, { "epoch": 0.6143869109421576, "grad_norm": 13.682400703430176, "learning_rate": 4.7026517058517516e-05, "loss": 3.2032, "step": 293500 }, { "epoch": 0.6154335666677831, "grad_norm": 10.591018676757812, "learning_rate": 4.702127280202722e-05, "loss": 3.197, "step": 294000 }, { "epoch": 0.6164802223934086, "grad_norm": 11.576348304748535, "learning_rate": 4.7016028545536925e-05, "loss": 3.1916, "step": 294500 }, { "epoch": 0.6175268781190341, "grad_norm": 10.084219932556152, "learning_rate": 4.7010784289046636e-05, "loss": 3.1907, "step": 295000 }, { "epoch": 0.6185735338446595, "grad_norm": 10.628474235534668, "learning_rate": 4.700554003255635e-05, "loss": 3.1665, "step": 295500 }, { "epoch": 0.619620189570285, "grad_norm": 12.60759449005127, "learning_rate": 4.700029577606606e-05, "loss": 3.1841, "step": 296000 }, { "epoch": 0.6206668452959105, "grad_norm": 12.369193077087402, "learning_rate": 4.699505151957576e-05, "loss": 3.2088, "step": 296500 }, { "epoch": 0.621713501021536, "grad_norm": 11.566487312316895, "learning_rate": 4.698980726308547e-05, "loss": 3.1792, "step": 297000 }, { "epoch": 0.6227601567471615, "grad_norm": 12.377405166625977, "learning_rate": 4.698456300659518e-05, "loss": 3.1977, "step": 297500 }, { "epoch": 0.6238068124727869, "grad_norm": 11.697103500366211, "learning_rate": 4.697931875010489e-05, "loss": 3.1707, "step": 298000 }, { "epoch": 0.6248534681984125, "grad_norm": 13.635916709899902, "learning_rate": 4.697407449361459e-05, "loss": 3.1918, "step": 298500 }, { "epoch": 0.6259001239240379, "grad_norm": 16.15915298461914, "learning_rate": 4.6968830237124304e-05, "loss": 3.1745, "step": 299000 }, { "epoch": 0.6269467796496634, "grad_norm": 12.154026985168457, "learning_rate": 4.696358598063401e-05, "loss": 3.1796, "step": 299500 }, { "epoch": 0.6279934353752888, "grad_norm": 12.658409118652344, "learning_rate": 4.695834172414372e-05, "loss": 3.2055, "step": 300000 }, { "epoch": 0.6290400911009144, "grad_norm": 12.623157501220703, "learning_rate": 4.695309746765343e-05, "loss": 3.1784, "step": 300500 }, { "epoch": 0.6300867468265399, "grad_norm": 11.981449127197266, "learning_rate": 4.6947853211163135e-05, "loss": 3.2058, "step": 301000 }, { "epoch": 0.6311334025521653, "grad_norm": 11.592101097106934, "learning_rate": 4.6942608954672846e-05, "loss": 3.1715, "step": 301500 }, { "epoch": 0.6321800582777908, "grad_norm": 12.896431922912598, "learning_rate": 4.693736469818255e-05, "loss": 3.1845, "step": 302000 }, { "epoch": 0.6332267140034162, "grad_norm": 12.116127014160156, "learning_rate": 4.693212044169226e-05, "loss": 3.1858, "step": 302500 }, { "epoch": 0.6342733697290418, "grad_norm": 11.120115280151367, "learning_rate": 4.6926876185201966e-05, "loss": 3.1757, "step": 303000 }, { "epoch": 0.6353200254546673, "grad_norm": 11.571722030639648, "learning_rate": 4.692163192871168e-05, "loss": 3.1914, "step": 303500 }, { "epoch": 0.6363666811802927, "grad_norm": 12.708115577697754, "learning_rate": 4.691638767222139e-05, "loss": 3.182, "step": 304000 }, { "epoch": 0.6374133369059182, "grad_norm": 11.867795944213867, "learning_rate": 4.69111434157311e-05, "loss": 3.1847, "step": 304500 }, { "epoch": 0.6384599926315437, "grad_norm": 12.01040267944336, "learning_rate": 4.69058991592408e-05, "loss": 3.184, "step": 305000 }, { "epoch": 0.6395066483571692, "grad_norm": 11.630237579345703, "learning_rate": 4.6900654902750514e-05, "loss": 3.1598, "step": 305500 }, { "epoch": 0.6405533040827947, "grad_norm": 20.760202407836914, "learning_rate": 4.689541064626022e-05, "loss": 3.1755, "step": 306000 }, { "epoch": 0.6415999598084201, "grad_norm": 13.840978622436523, "learning_rate": 4.689016638976992e-05, "loss": 3.1848, "step": 306500 }, { "epoch": 0.6426466155340456, "grad_norm": 11.745548248291016, "learning_rate": 4.6884922133279634e-05, "loss": 3.1668, "step": 307000 }, { "epoch": 0.6436932712596711, "grad_norm": 13.243752479553223, "learning_rate": 4.687967787678934e-05, "loss": 3.1663, "step": 307500 }, { "epoch": 0.6447399269852966, "grad_norm": 12.510363578796387, "learning_rate": 4.687443362029905e-05, "loss": 3.1697, "step": 308000 }, { "epoch": 0.645786582710922, "grad_norm": 18.612564086914062, "learning_rate": 4.686918936380876e-05, "loss": 3.1799, "step": 308500 }, { "epoch": 0.6468332384365475, "grad_norm": 13.028151512145996, "learning_rate": 4.686394510731847e-05, "loss": 3.1589, "step": 309000 }, { "epoch": 0.6478798941621731, "grad_norm": 11.039491653442383, "learning_rate": 4.6858700850828176e-05, "loss": 3.1795, "step": 309500 }, { "epoch": 0.6489265498877985, "grad_norm": 12.055290222167969, "learning_rate": 4.685345659433789e-05, "loss": 3.1635, "step": 310000 }, { "epoch": 0.649973205613424, "grad_norm": 10.920814514160156, "learning_rate": 4.684821233784759e-05, "loss": 3.1878, "step": 310500 }, { "epoch": 0.6510198613390494, "grad_norm": 13.798356056213379, "learning_rate": 4.68429680813573e-05, "loss": 3.1664, "step": 311000 }, { "epoch": 0.652066517064675, "grad_norm": 13.495950698852539, "learning_rate": 4.6837723824867007e-05, "loss": 3.1788, "step": 311500 }, { "epoch": 0.6531131727903005, "grad_norm": 12.02611255645752, "learning_rate": 4.683247956837671e-05, "loss": 3.1691, "step": 312000 }, { "epoch": 0.6541598285159259, "grad_norm": 11.283269882202148, "learning_rate": 4.682723531188642e-05, "loss": 3.1706, "step": 312500 }, { "epoch": 0.6552064842415514, "grad_norm": 13.125205039978027, "learning_rate": 4.682199105539613e-05, "loss": 3.1607, "step": 313000 }, { "epoch": 0.6562531399671768, "grad_norm": 12.36419677734375, "learning_rate": 4.6816746798905844e-05, "loss": 3.1728, "step": 313500 }, { "epoch": 0.6572997956928024, "grad_norm": 12.0266752243042, "learning_rate": 4.681150254241555e-05, "loss": 3.1943, "step": 314000 }, { "epoch": 0.6583464514184278, "grad_norm": 14.401873588562012, "learning_rate": 4.680625828592526e-05, "loss": 3.1618, "step": 314500 }, { "epoch": 0.6593931071440533, "grad_norm": 12.533392906188965, "learning_rate": 4.6801014029434964e-05, "loss": 3.1547, "step": 315000 }, { "epoch": 0.6604397628696788, "grad_norm": 13.615594863891602, "learning_rate": 4.6795769772944675e-05, "loss": 3.173, "step": 315500 }, { "epoch": 0.6614864185953043, "grad_norm": 19.541793823242188, "learning_rate": 4.679052551645438e-05, "loss": 3.1642, "step": 316000 }, { "epoch": 0.6625330743209298, "grad_norm": 10.553787231445312, "learning_rate": 4.678528125996409e-05, "loss": 3.1729, "step": 316500 }, { "epoch": 0.6635797300465552, "grad_norm": 12.00371265411377, "learning_rate": 4.6780037003473795e-05, "loss": 3.174, "step": 317000 }, { "epoch": 0.6646263857721807, "grad_norm": 12.444520950317383, "learning_rate": 4.6774792746983506e-05, "loss": 3.1714, "step": 317500 }, { "epoch": 0.6656730414978063, "grad_norm": 11.990275382995605, "learning_rate": 4.676954849049322e-05, "loss": 3.1784, "step": 318000 }, { "epoch": 0.6667196972234317, "grad_norm": 13.660876274108887, "learning_rate": 4.676430423400292e-05, "loss": 3.1388, "step": 318500 }, { "epoch": 0.6677663529490572, "grad_norm": 18.314346313476562, "learning_rate": 4.675905997751263e-05, "loss": 3.1664, "step": 319000 }, { "epoch": 0.6688130086746826, "grad_norm": 13.319297790527344, "learning_rate": 4.6753815721022336e-05, "loss": 3.1389, "step": 319500 }, { "epoch": 0.6698596644003081, "grad_norm": 14.355192184448242, "learning_rate": 4.674857146453205e-05, "loss": 3.1528, "step": 320000 }, { "epoch": 0.6709063201259337, "grad_norm": 12.027448654174805, "learning_rate": 4.674332720804175e-05, "loss": 3.1838, "step": 320500 }, { "epoch": 0.6719529758515591, "grad_norm": 11.1917085647583, "learning_rate": 4.673808295155146e-05, "loss": 3.1619, "step": 321000 }, { "epoch": 0.6729996315771846, "grad_norm": 11.791175842285156, "learning_rate": 4.6732838695061174e-05, "loss": 3.1332, "step": 321500 }, { "epoch": 0.67404628730281, "grad_norm": 10.749753952026367, "learning_rate": 4.6727594438570885e-05, "loss": 3.1667, "step": 322000 }, { "epoch": 0.6750929430284356, "grad_norm": 14.90285873413086, "learning_rate": 4.672235018208059e-05, "loss": 3.1548, "step": 322500 }, { "epoch": 0.676139598754061, "grad_norm": 11.376599311828613, "learning_rate": 4.6717105925590294e-05, "loss": 3.1512, "step": 323000 }, { "epoch": 0.6771862544796865, "grad_norm": 12.171027183532715, "learning_rate": 4.6711861669100005e-05, "loss": 3.1579, "step": 323500 }, { "epoch": 0.678232910205312, "grad_norm": 11.9298734664917, "learning_rate": 4.670661741260971e-05, "loss": 3.156, "step": 324000 }, { "epoch": 0.6792795659309375, "grad_norm": 17.16067123413086, "learning_rate": 4.670137315611942e-05, "loss": 3.152, "step": 324500 }, { "epoch": 0.680326221656563, "grad_norm": 12.69084644317627, "learning_rate": 4.6696128899629124e-05, "loss": 3.1641, "step": 325000 }, { "epoch": 0.6813728773821884, "grad_norm": 13.096558570861816, "learning_rate": 4.6690884643138835e-05, "loss": 3.1452, "step": 325500 }, { "epoch": 0.6824195331078139, "grad_norm": 12.769599914550781, "learning_rate": 4.6685640386648546e-05, "loss": 3.1438, "step": 326000 }, { "epoch": 0.6834661888334393, "grad_norm": 13.49288272857666, "learning_rate": 4.668039613015826e-05, "loss": 3.1699, "step": 326500 }, { "epoch": 0.6845128445590649, "grad_norm": 12.48360824584961, "learning_rate": 4.667515187366796e-05, "loss": 3.1594, "step": 327000 }, { "epoch": 0.6855595002846904, "grad_norm": 11.576814651489258, "learning_rate": 4.666990761717767e-05, "loss": 3.1533, "step": 327500 }, { "epoch": 0.6866061560103158, "grad_norm": 10.690085411071777, "learning_rate": 4.666466336068738e-05, "loss": 3.1505, "step": 328000 }, { "epoch": 0.6876528117359413, "grad_norm": 10.682583808898926, "learning_rate": 4.665941910419708e-05, "loss": 3.1568, "step": 328500 }, { "epoch": 0.6886994674615668, "grad_norm": 11.342494010925293, "learning_rate": 4.665417484770679e-05, "loss": 3.1389, "step": 329000 }, { "epoch": 0.6897461231871923, "grad_norm": 11.950178146362305, "learning_rate": 4.66489305912165e-05, "loss": 3.1455, "step": 329500 }, { "epoch": 0.6907927789128178, "grad_norm": 11.214993476867676, "learning_rate": 4.664368633472621e-05, "loss": 3.137, "step": 330000 }, { "epoch": 0.6918394346384432, "grad_norm": 11.733528137207031, "learning_rate": 4.663844207823592e-05, "loss": 3.1268, "step": 330500 }, { "epoch": 0.6928860903640687, "grad_norm": 10.676284790039062, "learning_rate": 4.663319782174563e-05, "loss": 3.1541, "step": 331000 }, { "epoch": 0.6939327460896942, "grad_norm": 10.14946174621582, "learning_rate": 4.6627953565255334e-05, "loss": 3.1249, "step": 331500 }, { "epoch": 0.6949794018153197, "grad_norm": 11.131180763244629, "learning_rate": 4.6622709308765046e-05, "loss": 3.1485, "step": 332000 }, { "epoch": 0.6960260575409452, "grad_norm": 9.187304496765137, "learning_rate": 4.661746505227475e-05, "loss": 3.1436, "step": 332500 }, { "epoch": 0.6970727132665706, "grad_norm": 12.483375549316406, "learning_rate": 4.661222079578446e-05, "loss": 3.143, "step": 333000 }, { "epoch": 0.6981193689921962, "grad_norm": 10.965605735778809, "learning_rate": 4.6606976539294165e-05, "loss": 3.133, "step": 333500 }, { "epoch": 0.6991660247178216, "grad_norm": 13.537543296813965, "learning_rate": 4.660173228280387e-05, "loss": 3.1432, "step": 334000 }, { "epoch": 0.7002126804434471, "grad_norm": 11.379817008972168, "learning_rate": 4.659648802631358e-05, "loss": 3.125, "step": 334500 }, { "epoch": 0.7012593361690725, "grad_norm": 11.519631385803223, "learning_rate": 4.659124376982329e-05, "loss": 3.1266, "step": 335000 }, { "epoch": 0.7023059918946981, "grad_norm": 11.667373657226562, "learning_rate": 4.6585999513333e-05, "loss": 3.1314, "step": 335500 }, { "epoch": 0.7033526476203236, "grad_norm": 12.422318458557129, "learning_rate": 4.658075525684271e-05, "loss": 3.158, "step": 336000 }, { "epoch": 0.704399303345949, "grad_norm": 15.955353736877441, "learning_rate": 4.657551100035242e-05, "loss": 3.1307, "step": 336500 }, { "epoch": 0.7054459590715745, "grad_norm": 17.625547409057617, "learning_rate": 4.657026674386212e-05, "loss": 3.1643, "step": 337000 }, { "epoch": 0.7064926147971999, "grad_norm": 15.156152725219727, "learning_rate": 4.6565022487371834e-05, "loss": 3.141, "step": 337500 }, { "epoch": 0.7075392705228255, "grad_norm": 13.791218757629395, "learning_rate": 4.655977823088154e-05, "loss": 3.1524, "step": 338000 }, { "epoch": 0.708585926248451, "grad_norm": 15.028203964233398, "learning_rate": 4.655453397439125e-05, "loss": 3.1182, "step": 338500 }, { "epoch": 0.7096325819740764, "grad_norm": 19.555953979492188, "learning_rate": 4.654928971790096e-05, "loss": 3.1358, "step": 339000 }, { "epoch": 0.7106792376997019, "grad_norm": 26.39183235168457, "learning_rate": 4.654404546141067e-05, "loss": 3.1278, "step": 339500 }, { "epoch": 0.7117258934253274, "grad_norm": 13.177945137023926, "learning_rate": 4.6538801204920375e-05, "loss": 3.1283, "step": 340000 }, { "epoch": 0.7127725491509529, "grad_norm": 11.65290355682373, "learning_rate": 4.653355694843008e-05, "loss": 3.1264, "step": 340500 }, { "epoch": 0.7138192048765784, "grad_norm": 13.292521476745605, "learning_rate": 4.652831269193979e-05, "loss": 3.1417, "step": 341000 }, { "epoch": 0.7148658606022038, "grad_norm": 12.040106773376465, "learning_rate": 4.6523068435449495e-05, "loss": 3.1519, "step": 341500 }, { "epoch": 0.7159125163278294, "grad_norm": 23.187206268310547, "learning_rate": 4.6517824178959206e-05, "loss": 3.1314, "step": 342000 }, { "epoch": 0.7169591720534548, "grad_norm": 12.262561798095703, "learning_rate": 4.651257992246891e-05, "loss": 3.1447, "step": 342500 }, { "epoch": 0.7180058277790803, "grad_norm": 11.518682479858398, "learning_rate": 4.650733566597862e-05, "loss": 3.1214, "step": 343000 }, { "epoch": 0.7190524835047057, "grad_norm": 25.714855194091797, "learning_rate": 4.650209140948833e-05, "loss": 3.1381, "step": 343500 }, { "epoch": 0.7200991392303312, "grad_norm": 37.9433479309082, "learning_rate": 4.6496847152998044e-05, "loss": 3.0885, "step": 344000 }, { "epoch": 0.7211457949559568, "grad_norm": 13.00617504119873, "learning_rate": 4.649160289650775e-05, "loss": 3.1218, "step": 344500 }, { "epoch": 0.7221924506815822, "grad_norm": 15.292563438415527, "learning_rate": 4.648635864001746e-05, "loss": 3.1475, "step": 345000 }, { "epoch": 0.7232391064072077, "grad_norm": 11.452534675598145, "learning_rate": 4.6481114383527163e-05, "loss": 3.1412, "step": 345500 }, { "epoch": 0.7242857621328331, "grad_norm": 12.143768310546875, "learning_rate": 4.647587012703687e-05, "loss": 3.1442, "step": 346000 }, { "epoch": 0.7253324178584587, "grad_norm": 13.421159744262695, "learning_rate": 4.647062587054658e-05, "loss": 3.1469, "step": 346500 }, { "epoch": 0.7263790735840842, "grad_norm": 12.185373306274414, "learning_rate": 4.646538161405628e-05, "loss": 3.1505, "step": 347000 }, { "epoch": 0.7274257293097096, "grad_norm": 13.826848983764648, "learning_rate": 4.6460137357565994e-05, "loss": 3.1322, "step": 347500 }, { "epoch": 0.7284723850353351, "grad_norm": 45.465518951416016, "learning_rate": 4.6454893101075705e-05, "loss": 3.1139, "step": 348000 }, { "epoch": 0.7295190407609605, "grad_norm": 11.254410743713379, "learning_rate": 4.6449648844585416e-05, "loss": 3.1349, "step": 348500 }, { "epoch": 0.7305656964865861, "grad_norm": 15.092592239379883, "learning_rate": 4.644440458809512e-05, "loss": 3.1169, "step": 349000 }, { "epoch": 0.7316123522122115, "grad_norm": 16.558916091918945, "learning_rate": 4.643916033160483e-05, "loss": 3.1211, "step": 349500 }, { "epoch": 0.732659007937837, "grad_norm": 16.054168701171875, "learning_rate": 4.6433916075114536e-05, "loss": 3.1329, "step": 350000 }, { "epoch": 0.7337056636634625, "grad_norm": 12.72154712677002, "learning_rate": 4.642867181862425e-05, "loss": 3.1356, "step": 350500 }, { "epoch": 0.734752319389088, "grad_norm": 12.969910621643066, "learning_rate": 4.642342756213395e-05, "loss": 3.1344, "step": 351000 }, { "epoch": 0.7357989751147135, "grad_norm": 16.4075870513916, "learning_rate": 4.6418183305643656e-05, "loss": 3.1261, "step": 351500 }, { "epoch": 0.7368456308403389, "grad_norm": 12.264347076416016, "learning_rate": 4.641293904915337e-05, "loss": 3.163, "step": 352000 }, { "epoch": 0.7378922865659644, "grad_norm": 14.247276306152344, "learning_rate": 4.640769479266308e-05, "loss": 3.1124, "step": 352500 }, { "epoch": 0.73893894229159, "grad_norm": 13.081042289733887, "learning_rate": 4.640245053617279e-05, "loss": 3.1414, "step": 353000 }, { "epoch": 0.7399855980172154, "grad_norm": 14.155680656433105, "learning_rate": 4.639720627968249e-05, "loss": 3.1189, "step": 353500 }, { "epoch": 0.7410322537428409, "grad_norm": 15.335273742675781, "learning_rate": 4.6391962023192204e-05, "loss": 3.1343, "step": 354000 }, { "epoch": 0.7420789094684663, "grad_norm": 13.121475219726562, "learning_rate": 4.638671776670191e-05, "loss": 3.1495, "step": 354500 }, { "epoch": 0.7431255651940918, "grad_norm": 14.866905212402344, "learning_rate": 4.638147351021162e-05, "loss": 3.1172, "step": 355000 }, { "epoch": 0.7441722209197174, "grad_norm": 16.041889190673828, "learning_rate": 4.6376229253721324e-05, "loss": 3.1159, "step": 355500 }, { "epoch": 0.7452188766453428, "grad_norm": 10.492414474487305, "learning_rate": 4.6370984997231035e-05, "loss": 3.1217, "step": 356000 }, { "epoch": 0.7462655323709683, "grad_norm": 19.068044662475586, "learning_rate": 4.6365740740740746e-05, "loss": 3.1634, "step": 356500 }, { "epoch": 0.7473121880965937, "grad_norm": 18.703811645507812, "learning_rate": 4.636049648425045e-05, "loss": 3.1154, "step": 357000 }, { "epoch": 0.7483588438222193, "grad_norm": 11.939187049865723, "learning_rate": 4.635525222776016e-05, "loss": 3.1222, "step": 357500 }, { "epoch": 0.7494054995478447, "grad_norm": 27.599912643432617, "learning_rate": 4.6350007971269866e-05, "loss": 3.1312, "step": 358000 }, { "epoch": 0.7504521552734702, "grad_norm": 31.859609603881836, "learning_rate": 4.634476371477958e-05, "loss": 3.1082, "step": 358500 }, { "epoch": 0.7514988109990957, "grad_norm": 18.116708755493164, "learning_rate": 4.633951945828928e-05, "loss": 3.1232, "step": 359000 }, { "epoch": 0.7525454667247212, "grad_norm": 19.195653915405273, "learning_rate": 4.633427520179899e-05, "loss": 3.113, "step": 359500 }, { "epoch": 0.7535921224503467, "grad_norm": 13.86038875579834, "learning_rate": 4.6329030945308697e-05, "loss": 3.1255, "step": 360000 }, { "epoch": 0.7546387781759721, "grad_norm": 45.58404541015625, "learning_rate": 4.632378668881841e-05, "loss": 3.1185, "step": 360500 }, { "epoch": 0.7556854339015976, "grad_norm": 15.275707244873047, "learning_rate": 4.631854243232812e-05, "loss": 3.0966, "step": 361000 }, { "epoch": 0.756732089627223, "grad_norm": 21.946880340576172, "learning_rate": 4.631329817583783e-05, "loss": 3.1057, "step": 361500 }, { "epoch": 0.7577787453528486, "grad_norm": 13.278172492980957, "learning_rate": 4.6308053919347534e-05, "loss": 3.1117, "step": 362000 }, { "epoch": 0.7588254010784741, "grad_norm": 12.617074012756348, "learning_rate": 4.630280966285724e-05, "loss": 3.1164, "step": 362500 }, { "epoch": 0.7598720568040995, "grad_norm": 15.399270057678223, "learning_rate": 4.629756540636695e-05, "loss": 3.121, "step": 363000 }, { "epoch": 0.760918712529725, "grad_norm": 19.072643280029297, "learning_rate": 4.6292321149876654e-05, "loss": 3.1111, "step": 363500 }, { "epoch": 0.7619653682553506, "grad_norm": 17.614280700683594, "learning_rate": 4.6287076893386365e-05, "loss": 3.1205, "step": 364000 }, { "epoch": 0.763012023980976, "grad_norm": 17.10926628112793, "learning_rate": 4.628183263689607e-05, "loss": 3.114, "step": 364500 }, { "epoch": 0.7640586797066015, "grad_norm": 11.822906494140625, "learning_rate": 4.627658838040578e-05, "loss": 3.1439, "step": 365000 }, { "epoch": 0.7651053354322269, "grad_norm": 30.336244583129883, "learning_rate": 4.627134412391549e-05, "loss": 3.1377, "step": 365500 }, { "epoch": 0.7661519911578524, "grad_norm": 11.75993824005127, "learning_rate": 4.62660998674252e-05, "loss": 3.1198, "step": 366000 }, { "epoch": 0.7671986468834779, "grad_norm": 51.73609161376953, "learning_rate": 4.626085561093491e-05, "loss": 3.1104, "step": 366500 }, { "epoch": 0.7682453026091034, "grad_norm": 15.445672988891602, "learning_rate": 4.625561135444462e-05, "loss": 3.0983, "step": 367000 }, { "epoch": 0.7692919583347289, "grad_norm": 12.662696838378906, "learning_rate": 4.625036709795432e-05, "loss": 3.1099, "step": 367500 }, { "epoch": 0.7703386140603543, "grad_norm": 11.818068504333496, "learning_rate": 4.6245122841464026e-05, "loss": 3.1119, "step": 368000 }, { "epoch": 0.7713852697859799, "grad_norm": 14.709866523742676, "learning_rate": 4.623987858497374e-05, "loss": 3.1241, "step": 368500 }, { "epoch": 0.7724319255116053, "grad_norm": 14.78165340423584, "learning_rate": 4.623463432848344e-05, "loss": 3.1216, "step": 369000 }, { "epoch": 0.7734785812372308, "grad_norm": 14.812651634216309, "learning_rate": 4.622939007199315e-05, "loss": 3.1132, "step": 369500 }, { "epoch": 0.7745252369628562, "grad_norm": 25.571758270263672, "learning_rate": 4.6224145815502864e-05, "loss": 3.1178, "step": 370000 }, { "epoch": 0.7755718926884818, "grad_norm": 16.22130012512207, "learning_rate": 4.6218901559012575e-05, "loss": 3.1208, "step": 370500 }, { "epoch": 0.7766185484141073, "grad_norm": 17.52582359313965, "learning_rate": 4.621365730252228e-05, "loss": 3.1183, "step": 371000 }, { "epoch": 0.7776652041397327, "grad_norm": 15.591412544250488, "learning_rate": 4.620841304603199e-05, "loss": 3.1035, "step": 371500 }, { "epoch": 0.7787118598653582, "grad_norm": 69.97331237792969, "learning_rate": 4.6203168789541695e-05, "loss": 3.1061, "step": 372000 }, { "epoch": 0.7797585155909836, "grad_norm": 20.953113555908203, "learning_rate": 4.6197924533051406e-05, "loss": 3.1078, "step": 372500 }, { "epoch": 0.7808051713166092, "grad_norm": 17.598583221435547, "learning_rate": 4.619268027656111e-05, "loss": 3.1304, "step": 373000 }, { "epoch": 0.7818518270422347, "grad_norm": 17.92699432373047, "learning_rate": 4.618743602007082e-05, "loss": 3.1133, "step": 373500 }, { "epoch": 0.7828984827678601, "grad_norm": 11.71329402923584, "learning_rate": 4.618219176358053e-05, "loss": 3.1303, "step": 374000 }, { "epoch": 0.7839451384934856, "grad_norm": 14.11148452758789, "learning_rate": 4.6176947507090237e-05, "loss": 3.1337, "step": 374500 }, { "epoch": 0.7849917942191111, "grad_norm": 13.23656177520752, "learning_rate": 4.617170325059995e-05, "loss": 3.1153, "step": 375000 }, { "epoch": 0.7860384499447366, "grad_norm": 25.251689910888672, "learning_rate": 4.616645899410965e-05, "loss": 3.0952, "step": 375500 }, { "epoch": 0.787085105670362, "grad_norm": 13.39192008972168, "learning_rate": 4.616121473761936e-05, "loss": 3.1113, "step": 376000 }, { "epoch": 0.7881317613959875, "grad_norm": 16.755403518676758, "learning_rate": 4.615597048112907e-05, "loss": 3.1224, "step": 376500 }, { "epoch": 0.7891784171216131, "grad_norm": 27.71371841430664, "learning_rate": 4.615072622463878e-05, "loss": 3.1259, "step": 377000 }, { "epoch": 0.7902250728472385, "grad_norm": 16.039310455322266, "learning_rate": 4.614548196814848e-05, "loss": 3.1306, "step": 377500 }, { "epoch": 0.791271728572864, "grad_norm": 11.861383438110352, "learning_rate": 4.6140237711658194e-05, "loss": 3.1241, "step": 378000 }, { "epoch": 0.7923183842984894, "grad_norm": 18.04317283630371, "learning_rate": 4.6134993455167905e-05, "loss": 3.1095, "step": 378500 }, { "epoch": 0.7933650400241149, "grad_norm": 12.305861473083496, "learning_rate": 4.6129749198677616e-05, "loss": 3.101, "step": 379000 }, { "epoch": 0.7944116957497405, "grad_norm": 17.65093421936035, "learning_rate": 4.612450494218732e-05, "loss": 3.0948, "step": 379500 }, { "epoch": 0.7954583514753659, "grad_norm": 12.631985664367676, "learning_rate": 4.6119260685697025e-05, "loss": 3.1176, "step": 380000 }, { "epoch": 0.7965050072009914, "grad_norm": 65.73524475097656, "learning_rate": 4.6114016429206736e-05, "loss": 3.1013, "step": 380500 }, { "epoch": 0.7975516629266168, "grad_norm": 97.567626953125, "learning_rate": 4.610877217271644e-05, "loss": 3.1127, "step": 381000 }, { "epoch": 0.7985983186522424, "grad_norm": 14.298874855041504, "learning_rate": 4.610352791622615e-05, "loss": 3.1111, "step": 381500 }, { "epoch": 0.7996449743778679, "grad_norm": 20.33808708190918, "learning_rate": 4.6098283659735855e-05, "loss": 3.0829, "step": 382000 }, { "epoch": 0.8006916301034933, "grad_norm": 14.380199432373047, "learning_rate": 4.6093039403245566e-05, "loss": 3.0893, "step": 382500 }, { "epoch": 0.8017382858291188, "grad_norm": 33.72682571411133, "learning_rate": 4.608779514675528e-05, "loss": 3.1149, "step": 383000 }, { "epoch": 0.8027849415547443, "grad_norm": 33.53144836425781, "learning_rate": 4.608255089026499e-05, "loss": 3.0981, "step": 383500 }, { "epoch": 0.8038315972803698, "grad_norm": 19.080598831176758, "learning_rate": 4.607730663377469e-05, "loss": 3.1149, "step": 384000 }, { "epoch": 0.8048782530059952, "grad_norm": 26.272058486938477, "learning_rate": 4.6072062377284404e-05, "loss": 3.0959, "step": 384500 }, { "epoch": 0.8059249087316207, "grad_norm": 18.81403350830078, "learning_rate": 4.606681812079411e-05, "loss": 3.1308, "step": 385000 }, { "epoch": 0.8069715644572462, "grad_norm": 34.49949264526367, "learning_rate": 4.606157386430381e-05, "loss": 3.1122, "step": 385500 }, { "epoch": 0.8080182201828717, "grad_norm": 12.258159637451172, "learning_rate": 4.6056329607813524e-05, "loss": 3.1076, "step": 386000 }, { "epoch": 0.8090648759084972, "grad_norm": 20.330074310302734, "learning_rate": 4.605108535132323e-05, "loss": 3.1078, "step": 386500 }, { "epoch": 0.8101115316341226, "grad_norm": 17.290864944458008, "learning_rate": 4.604584109483294e-05, "loss": 3.1231, "step": 387000 }, { "epoch": 0.8111581873597481, "grad_norm": 12.34325885772705, "learning_rate": 4.604059683834265e-05, "loss": 3.0984, "step": 387500 }, { "epoch": 0.8122048430853737, "grad_norm": 14.509262084960938, "learning_rate": 4.603535258185236e-05, "loss": 3.1068, "step": 388000 }, { "epoch": 0.8132514988109991, "grad_norm": 26.04011344909668, "learning_rate": 4.6030108325362065e-05, "loss": 3.1188, "step": 388500 }, { "epoch": 0.8142981545366246, "grad_norm": 52.44916534423828, "learning_rate": 4.6024864068871777e-05, "loss": 3.0933, "step": 389000 }, { "epoch": 0.81534481026225, "grad_norm": 16.695999145507812, "learning_rate": 4.601961981238148e-05, "loss": 3.1149, "step": 389500 }, { "epoch": 0.8163914659878755, "grad_norm": 14.324834823608398, "learning_rate": 4.601437555589119e-05, "loss": 3.0898, "step": 390000 }, { "epoch": 0.817438121713501, "grad_norm": 48.99614334106445, "learning_rate": 4.6009131299400896e-05, "loss": 3.0998, "step": 390500 }, { "epoch": 0.8184847774391265, "grad_norm": 33.5034065246582, "learning_rate": 4.60038870429106e-05, "loss": 3.0947, "step": 391000 }, { "epoch": 0.819531433164752, "grad_norm": 14.872162818908691, "learning_rate": 4.599864278642032e-05, "loss": 3.1248, "step": 391500 }, { "epoch": 0.8205780888903774, "grad_norm": 12.322943687438965, "learning_rate": 4.599339852993002e-05, "loss": 3.1169, "step": 392000 }, { "epoch": 0.821624744616003, "grad_norm": 17.79231071472168, "learning_rate": 4.5988154273439734e-05, "loss": 3.0981, "step": 392500 }, { "epoch": 0.8226714003416284, "grad_norm": 11.086318016052246, "learning_rate": 4.598291001694944e-05, "loss": 3.1116, "step": 393000 }, { "epoch": 0.8237180560672539, "grad_norm": 22.624835968017578, "learning_rate": 4.597766576045915e-05, "loss": 3.1169, "step": 393500 }, { "epoch": 0.8247647117928794, "grad_norm": 18.318923950195312, "learning_rate": 4.5972421503968853e-05, "loss": 3.1063, "step": 394000 }, { "epoch": 0.8258113675185049, "grad_norm": 22.03447914123535, "learning_rate": 4.5967177247478565e-05, "loss": 3.1279, "step": 394500 }, { "epoch": 0.8268580232441304, "grad_norm": 21.784147262573242, "learning_rate": 4.596193299098827e-05, "loss": 3.1237, "step": 395000 }, { "epoch": 0.8279046789697558, "grad_norm": 21.37521743774414, "learning_rate": 4.595668873449798e-05, "loss": 3.1047, "step": 395500 }, { "epoch": 0.8289513346953813, "grad_norm": 39.6719856262207, "learning_rate": 4.595144447800769e-05, "loss": 3.0993, "step": 396000 }, { "epoch": 0.8299979904210067, "grad_norm": 29.696725845336914, "learning_rate": 4.5946200221517395e-05, "loss": 3.1019, "step": 396500 }, { "epoch": 0.8310446461466323, "grad_norm": 29.071367263793945, "learning_rate": 4.5940955965027106e-05, "loss": 3.0963, "step": 397000 }, { "epoch": 0.8320913018722578, "grad_norm": 11.528687477111816, "learning_rate": 4.593571170853681e-05, "loss": 3.1136, "step": 397500 }, { "epoch": 0.8331379575978832, "grad_norm": 13.046791076660156, "learning_rate": 4.593046745204652e-05, "loss": 3.1102, "step": 398000 }, { "epoch": 0.8341846133235087, "grad_norm": 20.82904624938965, "learning_rate": 4.5925223195556226e-05, "loss": 3.1038, "step": 398500 }, { "epoch": 0.8352312690491343, "grad_norm": 12.648612022399902, "learning_rate": 4.591997893906594e-05, "loss": 3.0843, "step": 399000 }, { "epoch": 0.8362779247747597, "grad_norm": 18.9842472076416, "learning_rate": 4.591473468257564e-05, "loss": 3.115, "step": 399500 }, { "epoch": 0.8373245805003852, "grad_norm": 11.487064361572266, "learning_rate": 4.590949042608535e-05, "loss": 3.113, "step": 400000 }, { "epoch": 0.8383712362260106, "grad_norm": 16.92980194091797, "learning_rate": 4.5904246169595064e-05, "loss": 3.107, "step": 400500 }, { "epoch": 0.8394178919516362, "grad_norm": 12.553568840026855, "learning_rate": 4.5899001913104775e-05, "loss": 3.0876, "step": 401000 }, { "epoch": 0.8404645476772616, "grad_norm": 14.928557395935059, "learning_rate": 4.589375765661448e-05, "loss": 3.086, "step": 401500 }, { "epoch": 0.8415112034028871, "grad_norm": 14.710841178894043, "learning_rate": 4.588851340012418e-05, "loss": 3.1133, "step": 402000 }, { "epoch": 0.8425578591285126, "grad_norm": 14.191910743713379, "learning_rate": 4.5883269143633894e-05, "loss": 3.1075, "step": 402500 }, { "epoch": 0.843604514854138, "grad_norm": 22.177644729614258, "learning_rate": 4.58780248871436e-05, "loss": 3.0915, "step": 403000 }, { "epoch": 0.8446511705797636, "grad_norm": 14.51025104522705, "learning_rate": 4.587278063065331e-05, "loss": 3.107, "step": 403500 }, { "epoch": 0.845697826305389, "grad_norm": 18.137033462524414, "learning_rate": 4.5867536374163014e-05, "loss": 3.0967, "step": 404000 }, { "epoch": 0.8467444820310145, "grad_norm": 23.605554580688477, "learning_rate": 4.5862292117672725e-05, "loss": 3.108, "step": 404500 }, { "epoch": 0.84779113775664, "grad_norm": 32.6102409362793, "learning_rate": 4.5857047861182436e-05, "loss": 3.0928, "step": 405000 }, { "epoch": 0.8488377934822655, "grad_norm": 27.17864418029785, "learning_rate": 4.585180360469215e-05, "loss": 3.0898, "step": 405500 }, { "epoch": 0.849884449207891, "grad_norm": 17.892641067504883, "learning_rate": 4.584655934820185e-05, "loss": 3.089, "step": 406000 }, { "epoch": 0.8509311049335164, "grad_norm": 43.79111099243164, "learning_rate": 4.584131509171156e-05, "loss": 3.0824, "step": 406500 }, { "epoch": 0.8519777606591419, "grad_norm": 41.67380142211914, "learning_rate": 4.583607083522127e-05, "loss": 3.087, "step": 407000 }, { "epoch": 0.8530244163847673, "grad_norm": 14.559005737304688, "learning_rate": 4.583082657873098e-05, "loss": 3.079, "step": 407500 }, { "epoch": 0.8540710721103929, "grad_norm": 45.330467224121094, "learning_rate": 4.582558232224068e-05, "loss": 3.0858, "step": 408000 }, { "epoch": 0.8551177278360184, "grad_norm": 16.251359939575195, "learning_rate": 4.582033806575039e-05, "loss": 3.1099, "step": 408500 }, { "epoch": 0.8561643835616438, "grad_norm": 18.156383514404297, "learning_rate": 4.5815093809260105e-05, "loss": 3.1152, "step": 409000 }, { "epoch": 0.8572110392872693, "grad_norm": 13.647897720336914, "learning_rate": 4.580984955276981e-05, "loss": 3.1043, "step": 409500 }, { "epoch": 0.8582576950128948, "grad_norm": 14.120811462402344, "learning_rate": 4.580460529627952e-05, "loss": 3.0821, "step": 410000 }, { "epoch": 0.8593043507385203, "grad_norm": 58.999176025390625, "learning_rate": 4.5799361039789224e-05, "loss": 3.1088, "step": 410500 }, { "epoch": 0.8603510064641458, "grad_norm": 13.82067584991455, "learning_rate": 4.5794116783298935e-05, "loss": 3.1271, "step": 411000 }, { "epoch": 0.8613976621897712, "grad_norm": 23.34807777404785, "learning_rate": 4.578887252680864e-05, "loss": 3.0981, "step": 411500 }, { "epoch": 0.8624443179153968, "grad_norm": 12.234987258911133, "learning_rate": 4.578362827031835e-05, "loss": 3.1132, "step": 412000 }, { "epoch": 0.8634909736410222, "grad_norm": 11.78911018371582, "learning_rate": 4.5778384013828055e-05, "loss": 3.1198, "step": 412500 }, { "epoch": 0.8645376293666477, "grad_norm": 13.056859016418457, "learning_rate": 4.5773139757337766e-05, "loss": 3.0789, "step": 413000 }, { "epoch": 0.8655842850922731, "grad_norm": 10.18702220916748, "learning_rate": 4.576789550084748e-05, "loss": 3.1081, "step": 413500 }, { "epoch": 0.8666309408178986, "grad_norm": 57.595191955566406, "learning_rate": 4.576265124435718e-05, "loss": 3.0971, "step": 414000 }, { "epoch": 0.8676775965435242, "grad_norm": 17.224517822265625, "learning_rate": 4.575740698786689e-05, "loss": 3.1079, "step": 414500 }, { "epoch": 0.8687242522691496, "grad_norm": 12.786349296569824, "learning_rate": 4.57521627313766e-05, "loss": 3.0679, "step": 415000 }, { "epoch": 0.8697709079947751, "grad_norm": 14.679515838623047, "learning_rate": 4.574691847488631e-05, "loss": 3.0749, "step": 415500 }, { "epoch": 0.8708175637204005, "grad_norm": 13.246073722839355, "learning_rate": 4.574167421839601e-05, "loss": 3.0823, "step": 416000 }, { "epoch": 0.8718642194460261, "grad_norm": 21.75094985961914, "learning_rate": 4.573642996190572e-05, "loss": 3.0715, "step": 416500 }, { "epoch": 0.8729108751716516, "grad_norm": 17.223360061645508, "learning_rate": 4.573118570541543e-05, "loss": 3.0846, "step": 417000 }, { "epoch": 0.873957530897277, "grad_norm": 14.954916000366211, "learning_rate": 4.572594144892514e-05, "loss": 3.0939, "step": 417500 }, { "epoch": 0.8750041866229025, "grad_norm": 11.61034870147705, "learning_rate": 4.572069719243485e-05, "loss": 3.092, "step": 418000 }, { "epoch": 0.876050842348528, "grad_norm": 39.456031799316406, "learning_rate": 4.571545293594456e-05, "loss": 3.0765, "step": 418500 }, { "epoch": 0.8770974980741535, "grad_norm": 16.497331619262695, "learning_rate": 4.5710208679454265e-05, "loss": 3.1046, "step": 419000 }, { "epoch": 0.878144153799779, "grad_norm": 12.329867362976074, "learning_rate": 4.570496442296397e-05, "loss": 3.0779, "step": 419500 }, { "epoch": 0.8791908095254044, "grad_norm": 13.026203155517578, "learning_rate": 4.569972016647368e-05, "loss": 3.0968, "step": 420000 }, { "epoch": 0.8802374652510299, "grad_norm": 14.13266372680664, "learning_rate": 4.5694475909983385e-05, "loss": 3.0858, "step": 420500 }, { "epoch": 0.8812841209766554, "grad_norm": 22.775737762451172, "learning_rate": 4.5689231653493096e-05, "loss": 3.1063, "step": 421000 }, { "epoch": 0.8823307767022809, "grad_norm": 42.881046295166016, "learning_rate": 4.56839873970028e-05, "loss": 3.0711, "step": 421500 }, { "epoch": 0.8833774324279063, "grad_norm": 15.743260383605957, "learning_rate": 4.567874314051251e-05, "loss": 3.0844, "step": 422000 }, { "epoch": 0.8844240881535318, "grad_norm": 14.506000518798828, "learning_rate": 4.567349888402222e-05, "loss": 3.0687, "step": 422500 }, { "epoch": 0.8854707438791574, "grad_norm": 26.532411575317383, "learning_rate": 4.5668254627531933e-05, "loss": 3.0546, "step": 423000 }, { "epoch": 0.8865173996047828, "grad_norm": 42.49726867675781, "learning_rate": 4.566301037104164e-05, "loss": 3.0715, "step": 423500 }, { "epoch": 0.8875640553304083, "grad_norm": 53.040157318115234, "learning_rate": 4.565776611455135e-05, "loss": 3.0692, "step": 424000 }, { "epoch": 0.8886107110560337, "grad_norm": 28.18545913696289, "learning_rate": 4.565252185806105e-05, "loss": 3.0559, "step": 424500 }, { "epoch": 0.8896573667816593, "grad_norm": 85.22300720214844, "learning_rate": 4.564727760157076e-05, "loss": 3.0997, "step": 425000 }, { "epoch": 0.8907040225072848, "grad_norm": 57.55947494506836, "learning_rate": 4.564203334508047e-05, "loss": 3.0819, "step": 425500 }, { "epoch": 0.8917506782329102, "grad_norm": 25.03670310974121, "learning_rate": 4.563678908859017e-05, "loss": 3.0788, "step": 426000 }, { "epoch": 0.8927973339585357, "grad_norm": 15.23833179473877, "learning_rate": 4.5631544832099884e-05, "loss": 3.088, "step": 426500 }, { "epoch": 0.8938439896841611, "grad_norm": 52.148494720458984, "learning_rate": 4.5626300575609595e-05, "loss": 3.1052, "step": 427000 }, { "epoch": 0.8948906454097867, "grad_norm": 41.578529357910156, "learning_rate": 4.5621056319119306e-05, "loss": 3.1053, "step": 427500 }, { "epoch": 0.8959373011354121, "grad_norm": 68.60832214355469, "learning_rate": 4.561581206262901e-05, "loss": 3.0669, "step": 428000 }, { "epoch": 0.8969839568610376, "grad_norm": 15.874756813049316, "learning_rate": 4.561056780613872e-05, "loss": 3.0996, "step": 428500 }, { "epoch": 0.8980306125866631, "grad_norm": 22.627355575561523, "learning_rate": 4.5605323549648426e-05, "loss": 3.0884, "step": 429000 }, { "epoch": 0.8990772683122886, "grad_norm": 19.678470611572266, "learning_rate": 4.560007929315814e-05, "loss": 3.0859, "step": 429500 }, { "epoch": 0.9001239240379141, "grad_norm": 13.891215324401855, "learning_rate": 4.559483503666784e-05, "loss": 3.0922, "step": 430000 }, { "epoch": 0.9011705797635395, "grad_norm": 20.770320892333984, "learning_rate": 4.558959078017755e-05, "loss": 3.0428, "step": 430500 }, { "epoch": 0.902217235489165, "grad_norm": 52.867591857910156, "learning_rate": 4.558434652368726e-05, "loss": 3.0607, "step": 431000 }, { "epoch": 0.9032638912147904, "grad_norm": 18.523019790649414, "learning_rate": 4.557910226719697e-05, "loss": 3.0971, "step": 431500 }, { "epoch": 0.904310546940416, "grad_norm": 17.392412185668945, "learning_rate": 4.557385801070668e-05, "loss": 3.0689, "step": 432000 }, { "epoch": 0.9053572026660415, "grad_norm": 355.19708251953125, "learning_rate": 4.556861375421638e-05, "loss": 3.0734, "step": 432500 }, { "epoch": 0.9064038583916669, "grad_norm": 13.078202247619629, "learning_rate": 4.5563369497726094e-05, "loss": 3.0725, "step": 433000 }, { "epoch": 0.9074505141172924, "grad_norm": 19.954246520996094, "learning_rate": 4.55581252412358e-05, "loss": 3.0874, "step": 433500 }, { "epoch": 0.908497169842918, "grad_norm": 20.269153594970703, "learning_rate": 4.555288098474551e-05, "loss": 3.0735, "step": 434000 }, { "epoch": 0.9095438255685434, "grad_norm": 15.713488578796387, "learning_rate": 4.5547636728255214e-05, "loss": 3.079, "step": 434500 }, { "epoch": 0.9105904812941689, "grad_norm": 20.711267471313477, "learning_rate": 4.5542392471764925e-05, "loss": 3.0958, "step": 435000 }, { "epoch": 0.9116371370197943, "grad_norm": 128.25698852539062, "learning_rate": 4.5537148215274636e-05, "loss": 3.0851, "step": 435500 }, { "epoch": 0.9126837927454199, "grad_norm": 15.889130592346191, "learning_rate": 4.553190395878435e-05, "loss": 3.0888, "step": 436000 }, { "epoch": 0.9137304484710453, "grad_norm": 30.82235336303711, "learning_rate": 4.552665970229405e-05, "loss": 3.0972, "step": 436500 }, { "epoch": 0.9147771041966708, "grad_norm": 33.441490173339844, "learning_rate": 4.5521415445803756e-05, "loss": 3.0787, "step": 437000 }, { "epoch": 0.9158237599222963, "grad_norm": 27.07833480834961, "learning_rate": 4.551617118931347e-05, "loss": 3.0928, "step": 437500 }, { "epoch": 0.9168704156479217, "grad_norm": 16.609750747680664, "learning_rate": 4.551092693282317e-05, "loss": 3.0762, "step": 438000 }, { "epoch": 0.9179170713735473, "grad_norm": 20.565338134765625, "learning_rate": 4.550568267633288e-05, "loss": 3.0836, "step": 438500 }, { "epoch": 0.9189637270991727, "grad_norm": 13.191825866699219, "learning_rate": 4.5500438419842586e-05, "loss": 3.0691, "step": 439000 }, { "epoch": 0.9200103828247982, "grad_norm": 31.36122703552246, "learning_rate": 4.54951941633523e-05, "loss": 3.0859, "step": 439500 }, { "epoch": 0.9210570385504236, "grad_norm": 15.214014053344727, "learning_rate": 4.548994990686201e-05, "loss": 3.0682, "step": 440000 }, { "epoch": 0.9221036942760492, "grad_norm": 36.92890930175781, "learning_rate": 4.548470565037172e-05, "loss": 3.0778, "step": 440500 }, { "epoch": 0.9231503500016747, "grad_norm": 26.769058227539062, "learning_rate": 4.5479461393881424e-05, "loss": 3.0735, "step": 441000 }, { "epoch": 0.9241970057273001, "grad_norm": 13.002591133117676, "learning_rate": 4.5474217137391135e-05, "loss": 3.0724, "step": 441500 }, { "epoch": 0.9252436614529256, "grad_norm": 11.656171798706055, "learning_rate": 4.546897288090084e-05, "loss": 3.0602, "step": 442000 }, { "epoch": 0.9262903171785511, "grad_norm": 13.679676055908203, "learning_rate": 4.5463728624410544e-05, "loss": 3.0781, "step": 442500 }, { "epoch": 0.9273369729041766, "grad_norm": 20.03319549560547, "learning_rate": 4.5458484367920255e-05, "loss": 3.0666, "step": 443000 }, { "epoch": 0.9283836286298021, "grad_norm": 16.333377838134766, "learning_rate": 4.545324011142996e-05, "loss": 3.0651, "step": 443500 }, { "epoch": 0.9294302843554275, "grad_norm": 12.873374938964844, "learning_rate": 4.544799585493967e-05, "loss": 3.0658, "step": 444000 }, { "epoch": 0.930476940081053, "grad_norm": 14.69632625579834, "learning_rate": 4.544275159844938e-05, "loss": 3.0768, "step": 444500 }, { "epoch": 0.9315235958066785, "grad_norm": 19.797334671020508, "learning_rate": 4.543750734195909e-05, "loss": 3.0724, "step": 445000 }, { "epoch": 0.932570251532304, "grad_norm": 157.45033264160156, "learning_rate": 4.5432263085468797e-05, "loss": 3.0503, "step": 445500 }, { "epoch": 0.9336169072579295, "grad_norm": 18.04957389831543, "learning_rate": 4.542701882897851e-05, "loss": 3.0338, "step": 446000 }, { "epoch": 0.9346635629835549, "grad_norm": 18.310131072998047, "learning_rate": 4.542177457248821e-05, "loss": 3.066, "step": 446500 }, { "epoch": 0.9357102187091805, "grad_norm": 12.882999420166016, "learning_rate": 4.541653031599792e-05, "loss": 3.0571, "step": 447000 }, { "epoch": 0.9367568744348059, "grad_norm": 27.992361068725586, "learning_rate": 4.541128605950763e-05, "loss": 3.0664, "step": 447500 }, { "epoch": 0.9378035301604314, "grad_norm": 16.282066345214844, "learning_rate": 4.540604180301734e-05, "loss": 3.0722, "step": 448000 }, { "epoch": 0.9388501858860568, "grad_norm": 47.99612808227539, "learning_rate": 4.540079754652705e-05, "loss": 3.0715, "step": 448500 }, { "epoch": 0.9398968416116823, "grad_norm": 73.8532943725586, "learning_rate": 4.5395553290036754e-05, "loss": 3.0644, "step": 449000 }, { "epoch": 0.9409434973373079, "grad_norm": 20.446918487548828, "learning_rate": 4.5390309033546465e-05, "loss": 3.0716, "step": 449500 }, { "epoch": 0.9419901530629333, "grad_norm": 14.73962688446045, "learning_rate": 4.538506477705617e-05, "loss": 3.0772, "step": 450000 }, { "epoch": 0.9430368087885588, "grad_norm": 15.296753883361816, "learning_rate": 4.537982052056588e-05, "loss": 3.0726, "step": 450500 }, { "epoch": 0.9440834645141842, "grad_norm": 14.4580659866333, "learning_rate": 4.5374576264075584e-05, "loss": 3.0829, "step": 451000 }, { "epoch": 0.9451301202398098, "grad_norm": 22.72251319885254, "learning_rate": 4.5369332007585296e-05, "loss": 3.0573, "step": 451500 }, { "epoch": 0.9461767759654353, "grad_norm": 13.298014640808105, "learning_rate": 4.5364087751095e-05, "loss": 3.0841, "step": 452000 }, { "epoch": 0.9472234316910607, "grad_norm": 13.757491111755371, "learning_rate": 4.535884349460471e-05, "loss": 3.0544, "step": 452500 }, { "epoch": 0.9482700874166862, "grad_norm": 15.46120834350586, "learning_rate": 4.535359923811442e-05, "loss": 3.0634, "step": 453000 }, { "epoch": 0.9493167431423117, "grad_norm": 15.59118938446045, "learning_rate": 4.5348354981624126e-05, "loss": 3.0625, "step": 453500 }, { "epoch": 0.9503633988679372, "grad_norm": 12.76987361907959, "learning_rate": 4.534311072513384e-05, "loss": 3.0652, "step": 454000 }, { "epoch": 0.9514100545935626, "grad_norm": 44.825439453125, "learning_rate": 4.533786646864354e-05, "loss": 3.0369, "step": 454500 }, { "epoch": 0.9524567103191881, "grad_norm": 14.761260032653809, "learning_rate": 4.533262221215325e-05, "loss": 3.0805, "step": 455000 }, { "epoch": 0.9535033660448136, "grad_norm": 25.465566635131836, "learning_rate": 4.532737795566296e-05, "loss": 3.0826, "step": 455500 }, { "epoch": 0.9545500217704391, "grad_norm": 31.851551055908203, "learning_rate": 4.532213369917267e-05, "loss": 3.0494, "step": 456000 }, { "epoch": 0.9555966774960646, "grad_norm": 11.805193901062012, "learning_rate": 4.531688944268237e-05, "loss": 3.0339, "step": 456500 }, { "epoch": 0.95664333322169, "grad_norm": 17.05167007446289, "learning_rate": 4.5311645186192084e-05, "loss": 3.0681, "step": 457000 }, { "epoch": 0.9576899889473155, "grad_norm": 28.52716827392578, "learning_rate": 4.5306400929701795e-05, "loss": 3.0832, "step": 457500 }, { "epoch": 0.9587366446729411, "grad_norm": 15.145355224609375, "learning_rate": 4.5301156673211506e-05, "loss": 3.0618, "step": 458000 }, { "epoch": 0.9597833003985665, "grad_norm": 16.647741317749023, "learning_rate": 4.529591241672121e-05, "loss": 3.0416, "step": 458500 }, { "epoch": 0.960829956124192, "grad_norm": 15.798673629760742, "learning_rate": 4.5290668160230914e-05, "loss": 3.0657, "step": 459000 }, { "epoch": 0.9618766118498174, "grad_norm": 11.975316047668457, "learning_rate": 4.5285423903740625e-05, "loss": 3.0296, "step": 459500 }, { "epoch": 0.962923267575443, "grad_norm": 14.91629695892334, "learning_rate": 4.528017964725033e-05, "loss": 3.0414, "step": 460000 }, { "epoch": 0.9639699233010685, "grad_norm": 12.685609817504883, "learning_rate": 4.527493539076004e-05, "loss": 3.0616, "step": 460500 }, { "epoch": 0.9650165790266939, "grad_norm": 23.680830001831055, "learning_rate": 4.5269691134269745e-05, "loss": 3.0517, "step": 461000 }, { "epoch": 0.9660632347523194, "grad_norm": 13.528060913085938, "learning_rate": 4.5264446877779456e-05, "loss": 3.0326, "step": 461500 }, { "epoch": 0.9671098904779448, "grad_norm": 13.006345748901367, "learning_rate": 4.525920262128917e-05, "loss": 3.0372, "step": 462000 }, { "epoch": 0.9681565462035704, "grad_norm": 19.67546844482422, "learning_rate": 4.525395836479888e-05, "loss": 3.0357, "step": 462500 }, { "epoch": 0.9692032019291958, "grad_norm": 12.822738647460938, "learning_rate": 4.524871410830858e-05, "loss": 3.0607, "step": 463000 }, { "epoch": 0.9702498576548213, "grad_norm": 16.026735305786133, "learning_rate": 4.5243469851818294e-05, "loss": 3.0618, "step": 463500 }, { "epoch": 0.9712965133804468, "grad_norm": 15.216615676879883, "learning_rate": 4.5238225595328e-05, "loss": 3.0345, "step": 464000 }, { "epoch": 0.9723431691060723, "grad_norm": 19.85114288330078, "learning_rate": 4.52329813388377e-05, "loss": 3.0459, "step": 464500 }, { "epoch": 0.9733898248316978, "grad_norm": 16.159255981445312, "learning_rate": 4.5227737082347413e-05, "loss": 3.0374, "step": 465000 }, { "epoch": 0.9744364805573232, "grad_norm": 16.31018829345703, "learning_rate": 4.5222492825857124e-05, "loss": 3.0666, "step": 465500 }, { "epoch": 0.9754831362829487, "grad_norm": 18.721643447875977, "learning_rate": 4.5217248569366836e-05, "loss": 3.0513, "step": 466000 }, { "epoch": 0.9765297920085741, "grad_norm": 19.4873104095459, "learning_rate": 4.521200431287654e-05, "loss": 3.0422, "step": 466500 }, { "epoch": 0.9775764477341997, "grad_norm": 21.470741271972656, "learning_rate": 4.520676005638625e-05, "loss": 3.0542, "step": 467000 }, { "epoch": 0.9786231034598252, "grad_norm": 14.990913391113281, "learning_rate": 4.5201515799895955e-05, "loss": 3.0149, "step": 467500 }, { "epoch": 0.9796697591854506, "grad_norm": 12.022135734558105, "learning_rate": 4.5196271543405666e-05, "loss": 3.0218, "step": 468000 }, { "epoch": 0.9807164149110761, "grad_norm": 18.70465660095215, "learning_rate": 4.519102728691537e-05, "loss": 3.0532, "step": 468500 }, { "epoch": 0.9817630706367017, "grad_norm": 32.09714889526367, "learning_rate": 4.518578303042508e-05, "loss": 3.0486, "step": 469000 }, { "epoch": 0.9828097263623271, "grad_norm": 21.929746627807617, "learning_rate": 4.5180538773934786e-05, "loss": 3.0318, "step": 469500 }, { "epoch": 0.9838563820879526, "grad_norm": 14.956615447998047, "learning_rate": 4.51752945174445e-05, "loss": 3.0647, "step": 470000 }, { "epoch": 0.984903037813578, "grad_norm": 12.855756759643555, "learning_rate": 4.517005026095421e-05, "loss": 3.0396, "step": 470500 }, { "epoch": 0.9859496935392036, "grad_norm": 14.026811599731445, "learning_rate": 4.516480600446391e-05, "loss": 3.0427, "step": 471000 }, { "epoch": 0.986996349264829, "grad_norm": 15.396981239318848, "learning_rate": 4.5159561747973624e-05, "loss": 3.0419, "step": 471500 }, { "epoch": 0.9880430049904545, "grad_norm": 12.36718463897705, "learning_rate": 4.515431749148333e-05, "loss": 3.0453, "step": 472000 }, { "epoch": 0.98908966071608, "grad_norm": 14.636885643005371, "learning_rate": 4.514907323499304e-05, "loss": 3.0415, "step": 472500 }, { "epoch": 0.9901363164417054, "grad_norm": 13.592967987060547, "learning_rate": 4.514382897850274e-05, "loss": 3.0224, "step": 473000 }, { "epoch": 0.991182972167331, "grad_norm": 12.09045696258545, "learning_rate": 4.5138584722012454e-05, "loss": 3.0241, "step": 473500 }, { "epoch": 0.9922296278929564, "grad_norm": 14.95113468170166, "learning_rate": 4.513334046552216e-05, "loss": 3.0251, "step": 474000 }, { "epoch": 0.9932762836185819, "grad_norm": 14.160531997680664, "learning_rate": 4.512809620903187e-05, "loss": 3.043, "step": 474500 }, { "epoch": 0.9943229393442073, "grad_norm": 13.18771743774414, "learning_rate": 4.512285195254158e-05, "loss": 3.0101, "step": 475000 }, { "epoch": 0.9953695950698329, "grad_norm": 27.70610809326172, "learning_rate": 4.511760769605129e-05, "loss": 3.057, "step": 475500 }, { "epoch": 0.9964162507954584, "grad_norm": 33.5693244934082, "learning_rate": 4.5112363439560996e-05, "loss": 2.9994, "step": 476000 }, { "epoch": 0.9974629065210838, "grad_norm": 11.691963195800781, "learning_rate": 4.51071191830707e-05, "loss": 3.0185, "step": 476500 }, { "epoch": 0.9985095622467093, "grad_norm": 26.087095260620117, "learning_rate": 4.510187492658041e-05, "loss": 3.0352, "step": 477000 }, { "epoch": 0.9995562179723348, "grad_norm": 12.059103965759277, "learning_rate": 4.5096630670090116e-05, "loss": 3.0334, "step": 477500 }, { "epoch": 1.0006028736979602, "grad_norm": 26.19084930419922, "learning_rate": 4.509138641359983e-05, "loss": 3.0369, "step": 478000 }, { "epoch": 1.0016495294235856, "grad_norm": 16.75862693786621, "learning_rate": 4.508614215710953e-05, "loss": 3.0394, "step": 478500 }, { "epoch": 1.0026961851492113, "grad_norm": 13.32697582244873, "learning_rate": 4.508089790061924e-05, "loss": 3.0104, "step": 479000 }, { "epoch": 1.0037428408748368, "grad_norm": 25.445621490478516, "learning_rate": 4.507565364412895e-05, "loss": 3.0287, "step": 479500 }, { "epoch": 1.0047894966004622, "grad_norm": 12.598262786865234, "learning_rate": 4.5070409387638664e-05, "loss": 3.029, "step": 480000 }, { "epoch": 1.0058361523260877, "grad_norm": 16.800689697265625, "learning_rate": 4.506516513114837e-05, "loss": 3.0398, "step": 480500 }, { "epoch": 1.0068828080517132, "grad_norm": 57.17509841918945, "learning_rate": 4.505992087465808e-05, "loss": 3.0112, "step": 481000 }, { "epoch": 1.0079294637773386, "grad_norm": 10.812345504760742, "learning_rate": 4.5054676618167784e-05, "loss": 3.0207, "step": 481500 }, { "epoch": 1.008976119502964, "grad_norm": 11.172477722167969, "learning_rate": 4.504943236167749e-05, "loss": 3.0252, "step": 482000 }, { "epoch": 1.0100227752285895, "grad_norm": 13.186237335205078, "learning_rate": 4.50441881051872e-05, "loss": 3.0179, "step": 482500 }, { "epoch": 1.0110694309542152, "grad_norm": 54.3747444152832, "learning_rate": 4.503894384869691e-05, "loss": 3.0193, "step": 483000 }, { "epoch": 1.0121160866798407, "grad_norm": 25.733501434326172, "learning_rate": 4.503369959220662e-05, "loss": 3.0763, "step": 483500 }, { "epoch": 1.013162742405466, "grad_norm": 19.508371353149414, "learning_rate": 4.5028455335716326e-05, "loss": 3.0269, "step": 484000 }, { "epoch": 1.0142093981310916, "grad_norm": 11.77624225616455, "learning_rate": 4.502321107922604e-05, "loss": 3.0027, "step": 484500 }, { "epoch": 1.015256053856717, "grad_norm": 17.4995059967041, "learning_rate": 4.501796682273574e-05, "loss": 3.0273, "step": 485000 }, { "epoch": 1.0163027095823425, "grad_norm": 23.062786102294922, "learning_rate": 4.501272256624545e-05, "loss": 3.0262, "step": 485500 }, { "epoch": 1.017349365307968, "grad_norm": 13.89450740814209, "learning_rate": 4.500747830975516e-05, "loss": 3.017, "step": 486000 }, { "epoch": 1.0183960210335934, "grad_norm": 19.24853515625, "learning_rate": 4.500223405326487e-05, "loss": 3.0681, "step": 486500 }, { "epoch": 1.0194426767592188, "grad_norm": 12.250215530395508, "learning_rate": 4.499698979677457e-05, "loss": 3.0289, "step": 487000 }, { "epoch": 1.0204893324848445, "grad_norm": 48.8430061340332, "learning_rate": 4.499174554028428e-05, "loss": 3.0394, "step": 487500 }, { "epoch": 1.02153598821047, "grad_norm": 13.814092636108398, "learning_rate": 4.4986501283793994e-05, "loss": 3.0227, "step": 488000 }, { "epoch": 1.0225826439360954, "grad_norm": 22.870561599731445, "learning_rate": 4.49812570273037e-05, "loss": 3.0271, "step": 488500 }, { "epoch": 1.023629299661721, "grad_norm": 14.448551177978516, "learning_rate": 4.497601277081341e-05, "loss": 3.0304, "step": 489000 }, { "epoch": 1.0246759553873463, "grad_norm": 15.263923645019531, "learning_rate": 4.4970768514323114e-05, "loss": 3.0175, "step": 489500 }, { "epoch": 1.0257226111129718, "grad_norm": 36.70048141479492, "learning_rate": 4.4965524257832825e-05, "loss": 3.0136, "step": 490000 }, { "epoch": 1.0267692668385973, "grad_norm": 12.746089935302734, "learning_rate": 4.496028000134253e-05, "loss": 3.0052, "step": 490500 }, { "epoch": 1.0278159225642227, "grad_norm": 13.015069961547852, "learning_rate": 4.495503574485224e-05, "loss": 3.0248, "step": 491000 }, { "epoch": 1.0288625782898482, "grad_norm": 13.977209091186523, "learning_rate": 4.4949791488361945e-05, "loss": 3.0124, "step": 491500 }, { "epoch": 1.0299092340154739, "grad_norm": 31.213613510131836, "learning_rate": 4.4944547231871656e-05, "loss": 3.0111, "step": 492000 }, { "epoch": 1.0309558897410993, "grad_norm": 13.574820518493652, "learning_rate": 4.493930297538137e-05, "loss": 3.0153, "step": 492500 }, { "epoch": 1.0320025454667248, "grad_norm": 15.539687156677246, "learning_rate": 4.493405871889107e-05, "loss": 3.0219, "step": 493000 }, { "epoch": 1.0330492011923502, "grad_norm": 12.682628631591797, "learning_rate": 4.492881446240078e-05, "loss": 3.0256, "step": 493500 }, { "epoch": 1.0340958569179757, "grad_norm": 14.135174751281738, "learning_rate": 4.4923570205910487e-05, "loss": 2.9992, "step": 494000 }, { "epoch": 1.0351425126436011, "grad_norm": 12.935396194458008, "learning_rate": 4.49183259494202e-05, "loss": 3.0315, "step": 494500 }, { "epoch": 1.0361891683692266, "grad_norm": 32.11921310424805, "learning_rate": 4.49130816929299e-05, "loss": 3.0155, "step": 495000 }, { "epoch": 1.037235824094852, "grad_norm": 121.37078094482422, "learning_rate": 4.490783743643961e-05, "loss": 3.0156, "step": 495500 }, { "epoch": 1.0382824798204777, "grad_norm": 15.59047794342041, "learning_rate": 4.490259317994932e-05, "loss": 3.0128, "step": 496000 }, { "epoch": 1.0393291355461032, "grad_norm": 12.541696548461914, "learning_rate": 4.489734892345903e-05, "loss": 3.0278, "step": 496500 }, { "epoch": 1.0403757912717286, "grad_norm": 13.997806549072266, "learning_rate": 4.489210466696874e-05, "loss": 3.0284, "step": 497000 }, { "epoch": 1.041422446997354, "grad_norm": 12.62335205078125, "learning_rate": 4.488686041047845e-05, "loss": 3.0121, "step": 497500 }, { "epoch": 1.0424691027229795, "grad_norm": 22.37249755859375, "learning_rate": 4.4881616153988155e-05, "loss": 3.0129, "step": 498000 }, { "epoch": 1.043515758448605, "grad_norm": 21.653587341308594, "learning_rate": 4.487637189749786e-05, "loss": 3.0239, "step": 498500 }, { "epoch": 1.0445624141742305, "grad_norm": 14.070440292358398, "learning_rate": 4.487112764100757e-05, "loss": 3.0081, "step": 499000 }, { "epoch": 1.045609069899856, "grad_norm": 16.635643005371094, "learning_rate": 4.4865883384517275e-05, "loss": 2.9904, "step": 499500 }, { "epoch": 1.0466557256254814, "grad_norm": 19.281082153320312, "learning_rate": 4.4860639128026986e-05, "loss": 3.0033, "step": 500000 }, { "epoch": 1.047702381351107, "grad_norm": 30.87590789794922, "learning_rate": 4.48553948715367e-05, "loss": 3.0048, "step": 500500 }, { "epoch": 1.0487490370767325, "grad_norm": 20.048818588256836, "learning_rate": 4.485015061504641e-05, "loss": 3.0244, "step": 501000 }, { "epoch": 1.049795692802358, "grad_norm": 12.985297203063965, "learning_rate": 4.484490635855611e-05, "loss": 2.9952, "step": 501500 }, { "epoch": 1.0508423485279834, "grad_norm": 15.852160453796387, "learning_rate": 4.483966210206582e-05, "loss": 3.013, "step": 502000 }, { "epoch": 1.0518890042536089, "grad_norm": 46.74729919433594, "learning_rate": 4.483441784557553e-05, "loss": 3.0046, "step": 502500 }, { "epoch": 1.0529356599792343, "grad_norm": 11.094769477844238, "learning_rate": 4.482917358908524e-05, "loss": 3.0266, "step": 503000 }, { "epoch": 1.0539823157048598, "grad_norm": 17.936908721923828, "learning_rate": 4.482392933259494e-05, "loss": 3.017, "step": 503500 }, { "epoch": 1.0550289714304852, "grad_norm": 22.11948013305664, "learning_rate": 4.4818685076104654e-05, "loss": 3.0292, "step": 504000 }, { "epoch": 1.0560756271561107, "grad_norm": 14.220190048217773, "learning_rate": 4.481344081961436e-05, "loss": 3.0207, "step": 504500 }, { "epoch": 1.0571222828817364, "grad_norm": 13.326883316040039, "learning_rate": 4.480819656312407e-05, "loss": 3.0079, "step": 505000 }, { "epoch": 1.0581689386073618, "grad_norm": 16.47216796875, "learning_rate": 4.480295230663378e-05, "loss": 3.0252, "step": 505500 }, { "epoch": 1.0592155943329873, "grad_norm": 11.92165470123291, "learning_rate": 4.4797708050143485e-05, "loss": 2.9965, "step": 506000 }, { "epoch": 1.0602622500586127, "grad_norm": 12.399568557739258, "learning_rate": 4.4792463793653196e-05, "loss": 3.0131, "step": 506500 }, { "epoch": 1.0613089057842382, "grad_norm": 14.098377227783203, "learning_rate": 4.47872195371629e-05, "loss": 3.0159, "step": 507000 }, { "epoch": 1.0623555615098637, "grad_norm": 14.438943862915039, "learning_rate": 4.478197528067261e-05, "loss": 3.016, "step": 507500 }, { "epoch": 1.0634022172354891, "grad_norm": 21.75257110595703, "learning_rate": 4.4776731024182316e-05, "loss": 2.988, "step": 508000 }, { "epoch": 1.0644488729611146, "grad_norm": 29.110692977905273, "learning_rate": 4.4771486767692027e-05, "loss": 3.0014, "step": 508500 }, { "epoch": 1.06549552868674, "grad_norm": 12.922900199890137, "learning_rate": 4.476624251120173e-05, "loss": 3.0086, "step": 509000 }, { "epoch": 1.0665421844123657, "grad_norm": 10.247533798217773, "learning_rate": 4.476099825471144e-05, "loss": 3.0121, "step": 509500 }, { "epoch": 1.0675888401379912, "grad_norm": 23.4642276763916, "learning_rate": 4.475575399822115e-05, "loss": 3.0156, "step": 510000 }, { "epoch": 1.0686354958636166, "grad_norm": 12.800028800964355, "learning_rate": 4.475050974173086e-05, "loss": 2.9956, "step": 510500 }, { "epoch": 1.069682151589242, "grad_norm": 20.463367462158203, "learning_rate": 4.474526548524057e-05, "loss": 2.9995, "step": 511000 }, { "epoch": 1.0707288073148675, "grad_norm": 16.456207275390625, "learning_rate": 4.474002122875027e-05, "loss": 3.0073, "step": 511500 }, { "epoch": 1.071775463040493, "grad_norm": 12.476139068603516, "learning_rate": 4.4734776972259984e-05, "loss": 3.0182, "step": 512000 }, { "epoch": 1.0728221187661184, "grad_norm": 15.78511905670166, "learning_rate": 4.472953271576969e-05, "loss": 2.9931, "step": 512500 }, { "epoch": 1.073868774491744, "grad_norm": 12.59097671508789, "learning_rate": 4.47242884592794e-05, "loss": 3.0029, "step": 513000 }, { "epoch": 1.0749154302173696, "grad_norm": 11.312922477722168, "learning_rate": 4.4719044202789103e-05, "loss": 3.0109, "step": 513500 }, { "epoch": 1.075962085942995, "grad_norm": 12.595751762390137, "learning_rate": 4.4713799946298815e-05, "loss": 3.0003, "step": 514000 }, { "epoch": 1.0770087416686205, "grad_norm": 55.56752395629883, "learning_rate": 4.4708555689808526e-05, "loss": 3.0063, "step": 514500 }, { "epoch": 1.078055397394246, "grad_norm": 16.058473587036133, "learning_rate": 4.470331143331824e-05, "loss": 3.0107, "step": 515000 }, { "epoch": 1.0791020531198714, "grad_norm": 20.472576141357422, "learning_rate": 4.469806717682794e-05, "loss": 2.9777, "step": 515500 }, { "epoch": 1.0801487088454969, "grad_norm": 11.844259262084961, "learning_rate": 4.4692822920337645e-05, "loss": 2.9888, "step": 516000 }, { "epoch": 1.0811953645711223, "grad_norm": 17.13465690612793, "learning_rate": 4.4687578663847356e-05, "loss": 3.0171, "step": 516500 }, { "epoch": 1.0822420202967478, "grad_norm": 14.261621475219727, "learning_rate": 4.468233440735706e-05, "loss": 2.9781, "step": 517000 }, { "epoch": 1.0832886760223732, "grad_norm": 13.783784866333008, "learning_rate": 4.467709015086677e-05, "loss": 2.9738, "step": 517500 }, { "epoch": 1.0843353317479987, "grad_norm": 12.46524429321289, "learning_rate": 4.4671845894376476e-05, "loss": 3.0117, "step": 518000 }, { "epoch": 1.0853819874736244, "grad_norm": 12.254754066467285, "learning_rate": 4.4666601637886194e-05, "loss": 2.9816, "step": 518500 }, { "epoch": 1.0864286431992498, "grad_norm": 12.166215896606445, "learning_rate": 4.46613573813959e-05, "loss": 2.9923, "step": 519000 }, { "epoch": 1.0874752989248753, "grad_norm": 13.601927757263184, "learning_rate": 4.465611312490561e-05, "loss": 2.9982, "step": 519500 }, { "epoch": 1.0885219546505007, "grad_norm": 14.375411033630371, "learning_rate": 4.4650868868415314e-05, "loss": 2.9938, "step": 520000 }, { "epoch": 1.0895686103761262, "grad_norm": 142.57765197753906, "learning_rate": 4.4645624611925025e-05, "loss": 3.0014, "step": 520500 }, { "epoch": 1.0906152661017516, "grad_norm": 13.679147720336914, "learning_rate": 4.464038035543473e-05, "loss": 3.0016, "step": 521000 }, { "epoch": 1.091661921827377, "grad_norm": 14.07653522491455, "learning_rate": 4.463513609894443e-05, "loss": 2.9968, "step": 521500 }, { "epoch": 1.0927085775530025, "grad_norm": 14.384284019470215, "learning_rate": 4.4629891842454144e-05, "loss": 2.974, "step": 522000 }, { "epoch": 1.0937552332786282, "grad_norm": 18.943147659301758, "learning_rate": 4.4624647585963855e-05, "loss": 3.0014, "step": 522500 }, { "epoch": 1.0948018890042537, "grad_norm": 17.254920959472656, "learning_rate": 4.4619403329473567e-05, "loss": 3.0004, "step": 523000 }, { "epoch": 1.0958485447298791, "grad_norm": 17.326061248779297, "learning_rate": 4.461415907298327e-05, "loss": 2.9896, "step": 523500 }, { "epoch": 1.0968952004555046, "grad_norm": 16.01383399963379, "learning_rate": 4.460891481649298e-05, "loss": 3.0053, "step": 524000 }, { "epoch": 1.09794185618113, "grad_norm": 12.539518356323242, "learning_rate": 4.4603670560002686e-05, "loss": 2.9942, "step": 524500 }, { "epoch": 1.0989885119067555, "grad_norm": 14.947361946105957, "learning_rate": 4.45984263035124e-05, "loss": 3.0026, "step": 525000 }, { "epoch": 1.100035167632381, "grad_norm": 14.667593002319336, "learning_rate": 4.45931820470221e-05, "loss": 2.9975, "step": 525500 }, { "epoch": 1.1010818233580064, "grad_norm": 26.245397567749023, "learning_rate": 4.458793779053181e-05, "loss": 3.0017, "step": 526000 }, { "epoch": 1.1021284790836319, "grad_norm": 62.444007873535156, "learning_rate": 4.458269353404152e-05, "loss": 2.9877, "step": 526500 }, { "epoch": 1.1031751348092576, "grad_norm": 38.53151321411133, "learning_rate": 4.457744927755123e-05, "loss": 2.9982, "step": 527000 }, { "epoch": 1.104221790534883, "grad_norm": 13.515522003173828, "learning_rate": 4.457220502106094e-05, "loss": 3.0111, "step": 527500 }, { "epoch": 1.1052684462605085, "grad_norm": 29.79913902282715, "learning_rate": 4.4566960764570643e-05, "loss": 2.9971, "step": 528000 }, { "epoch": 1.106315101986134, "grad_norm": 22.882030487060547, "learning_rate": 4.4561716508080355e-05, "loss": 3.0122, "step": 528500 }, { "epoch": 1.1073617577117594, "grad_norm": 24.913042068481445, "learning_rate": 4.455647225159006e-05, "loss": 3.0378, "step": 529000 }, { "epoch": 1.1084084134373848, "grad_norm": 25.136837005615234, "learning_rate": 4.455122799509977e-05, "loss": 2.9819, "step": 529500 }, { "epoch": 1.1094550691630103, "grad_norm": 21.583850860595703, "learning_rate": 4.4545983738609474e-05, "loss": 3.0047, "step": 530000 }, { "epoch": 1.1105017248886357, "grad_norm": 17.42920684814453, "learning_rate": 4.4540739482119185e-05, "loss": 2.9807, "step": 530500 }, { "epoch": 1.1115483806142614, "grad_norm": 16.038732528686523, "learning_rate": 4.453549522562889e-05, "loss": 3.0063, "step": 531000 }, { "epoch": 1.1125950363398869, "grad_norm": 12.987587928771973, "learning_rate": 4.45302509691386e-05, "loss": 3.0032, "step": 531500 }, { "epoch": 1.1136416920655123, "grad_norm": 13.974059104919434, "learning_rate": 4.452500671264831e-05, "loss": 3.0154, "step": 532000 }, { "epoch": 1.1146883477911378, "grad_norm": 11.727263450622559, "learning_rate": 4.4519762456158016e-05, "loss": 3.0185, "step": 532500 }, { "epoch": 1.1157350035167632, "grad_norm": 33.058616638183594, "learning_rate": 4.451451819966773e-05, "loss": 3.0026, "step": 533000 }, { "epoch": 1.1167816592423887, "grad_norm": 15.313440322875977, "learning_rate": 4.450927394317743e-05, "loss": 3.0307, "step": 533500 }, { "epoch": 1.1178283149680142, "grad_norm": 12.583412170410156, "learning_rate": 4.450402968668714e-05, "loss": 3.003, "step": 534000 }, { "epoch": 1.1188749706936396, "grad_norm": 13.944348335266113, "learning_rate": 4.449878543019685e-05, "loss": 3.0047, "step": 534500 }, { "epoch": 1.119921626419265, "grad_norm": 15.244293212890625, "learning_rate": 4.449354117370656e-05, "loss": 3.0052, "step": 535000 }, { "epoch": 1.1209682821448907, "grad_norm": 18.96525764465332, "learning_rate": 4.448829691721626e-05, "loss": 3.0064, "step": 535500 }, { "epoch": 1.1220149378705162, "grad_norm": 14.799887657165527, "learning_rate": 4.448305266072598e-05, "loss": 2.9977, "step": 536000 }, { "epoch": 1.1230615935961417, "grad_norm": 15.693257331848145, "learning_rate": 4.4477808404235684e-05, "loss": 2.9813, "step": 536500 }, { "epoch": 1.1241082493217671, "grad_norm": 22.526569366455078, "learning_rate": 4.4472564147745395e-05, "loss": 2.998, "step": 537000 }, { "epoch": 1.1251549050473926, "grad_norm": 16.8995418548584, "learning_rate": 4.44673198912551e-05, "loss": 2.9903, "step": 537500 }, { "epoch": 1.126201560773018, "grad_norm": 51.22627639770508, "learning_rate": 4.446207563476481e-05, "loss": 3.0109, "step": 538000 }, { "epoch": 1.1272482164986435, "grad_norm": 15.429457664489746, "learning_rate": 4.4456831378274515e-05, "loss": 2.9904, "step": 538500 }, { "epoch": 1.128294872224269, "grad_norm": 15.69625186920166, "learning_rate": 4.445158712178422e-05, "loss": 3.0231, "step": 539000 }, { "epoch": 1.1293415279498946, "grad_norm": 25.32979965209961, "learning_rate": 4.444634286529393e-05, "loss": 3.0081, "step": 539500 }, { "epoch": 1.13038818367552, "grad_norm": 15.343337059020996, "learning_rate": 4.444109860880364e-05, "loss": 2.9931, "step": 540000 }, { "epoch": 1.1314348394011455, "grad_norm": 11.745262145996094, "learning_rate": 4.443585435231335e-05, "loss": 3.0169, "step": 540500 }, { "epoch": 1.132481495126771, "grad_norm": 14.267667770385742, "learning_rate": 4.443061009582306e-05, "loss": 2.9967, "step": 541000 }, { "epoch": 1.1335281508523964, "grad_norm": 28.496227264404297, "learning_rate": 4.442536583933277e-05, "loss": 2.9873, "step": 541500 }, { "epoch": 1.134574806578022, "grad_norm": 16.68963623046875, "learning_rate": 4.442012158284247e-05, "loss": 3.0096, "step": 542000 }, { "epoch": 1.1356214623036474, "grad_norm": 14.41950511932373, "learning_rate": 4.4414877326352183e-05, "loss": 2.9946, "step": 542500 }, { "epoch": 1.1366681180292728, "grad_norm": 19.147865295410156, "learning_rate": 4.440963306986189e-05, "loss": 2.9877, "step": 543000 }, { "epoch": 1.1377147737548983, "grad_norm": 17.30901336669922, "learning_rate": 4.44043888133716e-05, "loss": 3.0106, "step": 543500 }, { "epoch": 1.1387614294805237, "grad_norm": 14.750303268432617, "learning_rate": 4.43991445568813e-05, "loss": 3.0113, "step": 544000 }, { "epoch": 1.1398080852061494, "grad_norm": 13.227818489074707, "learning_rate": 4.4393900300391014e-05, "loss": 3.0062, "step": 544500 }, { "epoch": 1.1408547409317749, "grad_norm": 13.836867332458496, "learning_rate": 4.4388656043900725e-05, "loss": 2.9822, "step": 545000 }, { "epoch": 1.1419013966574003, "grad_norm": 12.408637046813965, "learning_rate": 4.438341178741043e-05, "loss": 3.001, "step": 545500 }, { "epoch": 1.1429480523830258, "grad_norm": 16.042613983154297, "learning_rate": 4.437816753092014e-05, "loss": 2.9869, "step": 546000 }, { "epoch": 1.1439947081086512, "grad_norm": 12.85754680633545, "learning_rate": 4.4372923274429845e-05, "loss": 2.9784, "step": 546500 }, { "epoch": 1.1450413638342767, "grad_norm": 12.984729766845703, "learning_rate": 4.4367679017939556e-05, "loss": 2.9978, "step": 547000 }, { "epoch": 1.1460880195599021, "grad_norm": 16.3748779296875, "learning_rate": 4.436243476144926e-05, "loss": 2.9633, "step": 547500 }, { "epoch": 1.1471346752855276, "grad_norm": 16.574893951416016, "learning_rate": 4.435719050495897e-05, "loss": 2.9956, "step": 548000 }, { "epoch": 1.1481813310111533, "grad_norm": 18.281940460205078, "learning_rate": 4.4351946248468676e-05, "loss": 2.9775, "step": 548500 }, { "epoch": 1.1492279867367787, "grad_norm": 16.105772018432617, "learning_rate": 4.434670199197839e-05, "loss": 2.9949, "step": 549000 }, { "epoch": 1.1502746424624042, "grad_norm": 15.92091178894043, "learning_rate": 4.43414577354881e-05, "loss": 2.9797, "step": 549500 }, { "epoch": 1.1513212981880296, "grad_norm": 15.19692611694336, "learning_rate": 4.43362134789978e-05, "loss": 2.977, "step": 550000 }, { "epoch": 1.152367953913655, "grad_norm": 18.256595611572266, "learning_rate": 4.433096922250751e-05, "loss": 2.9773, "step": 550500 }, { "epoch": 1.1534146096392806, "grad_norm": 18.08394432067871, "learning_rate": 4.432572496601722e-05, "loss": 2.9965, "step": 551000 }, { "epoch": 1.154461265364906, "grad_norm": 13.749302864074707, "learning_rate": 4.432048070952693e-05, "loss": 2.9971, "step": 551500 }, { "epoch": 1.1555079210905315, "grad_norm": 18.032032012939453, "learning_rate": 4.431523645303663e-05, "loss": 2.9945, "step": 552000 }, { "epoch": 1.156554576816157, "grad_norm": 26.1346492767334, "learning_rate": 4.4309992196546344e-05, "loss": 3.0053, "step": 552500 }, { "epoch": 1.1576012325417824, "grad_norm": 17.15296173095703, "learning_rate": 4.430474794005605e-05, "loss": 3.0059, "step": 553000 }, { "epoch": 1.158647888267408, "grad_norm": 12.93362045288086, "learning_rate": 4.4299503683565766e-05, "loss": 2.9863, "step": 553500 }, { "epoch": 1.1596945439930335, "grad_norm": 13.018702507019043, "learning_rate": 4.429425942707547e-05, "loss": 2.9797, "step": 554000 }, { "epoch": 1.160741199718659, "grad_norm": 16.219810485839844, "learning_rate": 4.428901517058518e-05, "loss": 2.9839, "step": 554500 }, { "epoch": 1.1617878554442844, "grad_norm": 13.489054679870605, "learning_rate": 4.4283770914094886e-05, "loss": 2.9873, "step": 555000 }, { "epoch": 1.1628345111699099, "grad_norm": 13.048511505126953, "learning_rate": 4.427852665760459e-05, "loss": 2.9821, "step": 555500 }, { "epoch": 1.1638811668955353, "grad_norm": 13.231034278869629, "learning_rate": 4.42732824011143e-05, "loss": 2.9853, "step": 556000 }, { "epoch": 1.1649278226211608, "grad_norm": 11.30261516571045, "learning_rate": 4.4268038144624006e-05, "loss": 2.9876, "step": 556500 }, { "epoch": 1.1659744783467865, "grad_norm": 11.621756553649902, "learning_rate": 4.426279388813372e-05, "loss": 3.0014, "step": 557000 }, { "epoch": 1.167021134072412, "grad_norm": 11.884033203125, "learning_rate": 4.425754963164343e-05, "loss": 2.9736, "step": 557500 }, { "epoch": 1.1680677897980374, "grad_norm": 13.278491973876953, "learning_rate": 4.425230537515314e-05, "loss": 2.9827, "step": 558000 }, { "epoch": 1.1691144455236628, "grad_norm": 17.669145584106445, "learning_rate": 4.424706111866284e-05, "loss": 2.9671, "step": 558500 }, { "epoch": 1.1701611012492883, "grad_norm": 14.084568977355957, "learning_rate": 4.4241816862172554e-05, "loss": 2.9884, "step": 559000 }, { "epoch": 1.1712077569749137, "grad_norm": 14.912100791931152, "learning_rate": 4.423657260568226e-05, "loss": 2.9876, "step": 559500 }, { "epoch": 1.1722544127005392, "grad_norm": 24.146312713623047, "learning_rate": 4.423132834919197e-05, "loss": 2.9609, "step": 560000 }, { "epoch": 1.1733010684261647, "grad_norm": 11.574045181274414, "learning_rate": 4.4226084092701674e-05, "loss": 2.9738, "step": 560500 }, { "epoch": 1.1743477241517901, "grad_norm": 13.705992698669434, "learning_rate": 4.422083983621138e-05, "loss": 2.9728, "step": 561000 }, { "epoch": 1.1753943798774156, "grad_norm": 17.499820709228516, "learning_rate": 4.421559557972109e-05, "loss": 2.9538, "step": 561500 }, { "epoch": 1.1764410356030413, "grad_norm": 15.150625228881836, "learning_rate": 4.42103513232308e-05, "loss": 2.9766, "step": 562000 }, { "epoch": 1.1774876913286667, "grad_norm": 15.392008781433105, "learning_rate": 4.420510706674051e-05, "loss": 2.9721, "step": 562500 }, { "epoch": 1.1785343470542922, "grad_norm": 17.179031372070312, "learning_rate": 4.4199862810250216e-05, "loss": 2.9861, "step": 563000 }, { "epoch": 1.1795810027799176, "grad_norm": 18.91165542602539, "learning_rate": 4.419461855375993e-05, "loss": 2.9859, "step": 563500 }, { "epoch": 1.180627658505543, "grad_norm": 13.514496803283691, "learning_rate": 4.418937429726963e-05, "loss": 2.955, "step": 564000 }, { "epoch": 1.1816743142311685, "grad_norm": 20.33497428894043, "learning_rate": 4.418413004077934e-05, "loss": 2.992, "step": 564500 }, { "epoch": 1.182720969956794, "grad_norm": 12.73275375366211, "learning_rate": 4.4178885784289047e-05, "loss": 2.9936, "step": 565000 }, { "epoch": 1.1837676256824194, "grad_norm": 12.380844116210938, "learning_rate": 4.417364152779876e-05, "loss": 2.9781, "step": 565500 }, { "epoch": 1.1848142814080451, "grad_norm": 14.160115242004395, "learning_rate": 4.416839727130846e-05, "loss": 2.9662, "step": 566000 }, { "epoch": 1.1858609371336706, "grad_norm": 13.377715110778809, "learning_rate": 4.416315301481817e-05, "loss": 2.9802, "step": 566500 }, { "epoch": 1.186907592859296, "grad_norm": 14.054967880249023, "learning_rate": 4.4157908758327884e-05, "loss": 2.9843, "step": 567000 }, { "epoch": 1.1879542485849215, "grad_norm": 16.117753982543945, "learning_rate": 4.415266450183759e-05, "loss": 2.985, "step": 567500 }, { "epoch": 1.189000904310547, "grad_norm": 28.80914306640625, "learning_rate": 4.41474202453473e-05, "loss": 2.9537, "step": 568000 }, { "epoch": 1.1900475600361724, "grad_norm": 17.86271858215332, "learning_rate": 4.4142175988857004e-05, "loss": 2.968, "step": 568500 }, { "epoch": 1.1910942157617979, "grad_norm": 12.040626525878906, "learning_rate": 4.4136931732366715e-05, "loss": 2.9717, "step": 569000 }, { "epoch": 1.1921408714874233, "grad_norm": 60.07307434082031, "learning_rate": 4.413168747587642e-05, "loss": 2.9522, "step": 569500 }, { "epoch": 1.1931875272130488, "grad_norm": 12.939506530761719, "learning_rate": 4.412644321938613e-05, "loss": 2.9699, "step": 570000 }, { "epoch": 1.1942341829386742, "grad_norm": 12.909160614013672, "learning_rate": 4.4121198962895834e-05, "loss": 2.9549, "step": 570500 }, { "epoch": 1.1952808386643, "grad_norm": 14.894948959350586, "learning_rate": 4.4115954706405546e-05, "loss": 2.9664, "step": 571000 }, { "epoch": 1.1963274943899254, "grad_norm": 11.685604095458984, "learning_rate": 4.411071044991526e-05, "loss": 2.9657, "step": 571500 }, { "epoch": 1.1973741501155508, "grad_norm": 15.292893409729004, "learning_rate": 4.410546619342497e-05, "loss": 2.9689, "step": 572000 }, { "epoch": 1.1984208058411763, "grad_norm": 32.55264663696289, "learning_rate": 4.410022193693467e-05, "loss": 2.9625, "step": 572500 }, { "epoch": 1.1994674615668017, "grad_norm": 13.89078140258789, "learning_rate": 4.4094977680444376e-05, "loss": 2.979, "step": 573000 }, { "epoch": 1.2005141172924272, "grad_norm": 194.86131286621094, "learning_rate": 4.408973342395409e-05, "loss": 2.9577, "step": 573500 }, { "epoch": 1.2015607730180526, "grad_norm": 23.70923614501953, "learning_rate": 4.408448916746379e-05, "loss": 2.9792, "step": 574000 }, { "epoch": 1.2026074287436783, "grad_norm": 14.962553024291992, "learning_rate": 4.40792449109735e-05, "loss": 2.9661, "step": 574500 }, { "epoch": 1.2036540844693038, "grad_norm": 14.002620697021484, "learning_rate": 4.4074000654483214e-05, "loss": 2.9732, "step": 575000 }, { "epoch": 1.2047007401949292, "grad_norm": 13.932755470275879, "learning_rate": 4.4068756397992925e-05, "loss": 2.9709, "step": 575500 }, { "epoch": 1.2057473959205547, "grad_norm": 21.584552764892578, "learning_rate": 4.406351214150263e-05, "loss": 2.9723, "step": 576000 }, { "epoch": 1.2067940516461801, "grad_norm": 12.36601734161377, "learning_rate": 4.405826788501234e-05, "loss": 2.9654, "step": 576500 }, { "epoch": 1.2078407073718056, "grad_norm": 14.37961483001709, "learning_rate": 4.4053023628522045e-05, "loss": 2.9533, "step": 577000 }, { "epoch": 1.208887363097431, "grad_norm": 29.758607864379883, "learning_rate": 4.4047779372031756e-05, "loss": 2.9512, "step": 577500 }, { "epoch": 1.2099340188230565, "grad_norm": 14.40434455871582, "learning_rate": 4.404253511554146e-05, "loss": 2.9625, "step": 578000 }, { "epoch": 1.210980674548682, "grad_norm": 19.294954299926758, "learning_rate": 4.4037290859051164e-05, "loss": 2.9391, "step": 578500 }, { "epoch": 1.2120273302743074, "grad_norm": 11.44865894317627, "learning_rate": 4.4032046602560875e-05, "loss": 2.9771, "step": 579000 }, { "epoch": 1.213073985999933, "grad_norm": 16.60190773010254, "learning_rate": 4.4026802346070586e-05, "loss": 2.952, "step": 579500 }, { "epoch": 1.2141206417255586, "grad_norm": 15.689961433410645, "learning_rate": 4.40215580895803e-05, "loss": 2.9526, "step": 580000 }, { "epoch": 1.215167297451184, "grad_norm": 12.488316535949707, "learning_rate": 4.401631383309e-05, "loss": 2.9506, "step": 580500 }, { "epoch": 1.2162139531768095, "grad_norm": 17.032251358032227, "learning_rate": 4.401106957659971e-05, "loss": 2.9481, "step": 581000 }, { "epoch": 1.217260608902435, "grad_norm": 12.376636505126953, "learning_rate": 4.400582532010942e-05, "loss": 2.9572, "step": 581500 }, { "epoch": 1.2183072646280604, "grad_norm": 14.647045135498047, "learning_rate": 4.400058106361913e-05, "loss": 2.9529, "step": 582000 }, { "epoch": 1.2193539203536858, "grad_norm": 16.646379470825195, "learning_rate": 4.399533680712883e-05, "loss": 2.9672, "step": 582500 }, { "epoch": 1.2204005760793113, "grad_norm": 38.438377380371094, "learning_rate": 4.3990092550638544e-05, "loss": 2.9428, "step": 583000 }, { "epoch": 1.221447231804937, "grad_norm": 12.809453964233398, "learning_rate": 4.398484829414825e-05, "loss": 2.99, "step": 583500 }, { "epoch": 1.2224938875305624, "grad_norm": 26.387304306030273, "learning_rate": 4.397960403765796e-05, "loss": 2.9688, "step": 584000 }, { "epoch": 1.2235405432561879, "grad_norm": 16.358354568481445, "learning_rate": 4.397435978116767e-05, "loss": 2.9464, "step": 584500 }, { "epoch": 1.2245871989818133, "grad_norm": 18.186038970947266, "learning_rate": 4.3969115524677374e-05, "loss": 2.9248, "step": 585000 }, { "epoch": 1.2256338547074388, "grad_norm": 14.480525970458984, "learning_rate": 4.3963871268187086e-05, "loss": 2.9473, "step": 585500 }, { "epoch": 1.2266805104330643, "grad_norm": 15.298192977905273, "learning_rate": 4.395862701169679e-05, "loss": 2.972, "step": 586000 }, { "epoch": 1.2277271661586897, "grad_norm": 22.319883346557617, "learning_rate": 4.39533827552065e-05, "loss": 2.9505, "step": 586500 }, { "epoch": 1.2287738218843152, "grad_norm": 14.511069297790527, "learning_rate": 4.3948138498716205e-05, "loss": 2.9511, "step": 587000 }, { "epoch": 1.2298204776099406, "grad_norm": 13.26965045928955, "learning_rate": 4.3942894242225916e-05, "loss": 2.9455, "step": 587500 }, { "epoch": 1.230867133335566, "grad_norm": 20.28490447998047, "learning_rate": 4.393764998573562e-05, "loss": 2.953, "step": 588000 }, { "epoch": 1.2319137890611918, "grad_norm": 12.960942268371582, "learning_rate": 4.393240572924533e-05, "loss": 2.9524, "step": 588500 }, { "epoch": 1.2329604447868172, "grad_norm": 15.80163288116455, "learning_rate": 4.392716147275504e-05, "loss": 2.9652, "step": 589000 }, { "epoch": 1.2340071005124427, "grad_norm": 14.4727144241333, "learning_rate": 4.392191721626475e-05, "loss": 2.952, "step": 589500 }, { "epoch": 1.2350537562380681, "grad_norm": 13.895008087158203, "learning_rate": 4.391667295977446e-05, "loss": 2.9525, "step": 590000 }, { "epoch": 1.2361004119636936, "grad_norm": 10.901680946350098, "learning_rate": 4.391142870328416e-05, "loss": 2.9683, "step": 590500 }, { "epoch": 1.237147067689319, "grad_norm": 14.707724571228027, "learning_rate": 4.3906184446793874e-05, "loss": 2.965, "step": 591000 }, { "epoch": 1.2381937234149445, "grad_norm": 13.017621040344238, "learning_rate": 4.390094019030358e-05, "loss": 2.9441, "step": 591500 }, { "epoch": 1.2392403791405702, "grad_norm": 14.536526679992676, "learning_rate": 4.389569593381329e-05, "loss": 2.9341, "step": 592000 }, { "epoch": 1.2402870348661956, "grad_norm": 23.04094123840332, "learning_rate": 4.3890451677323e-05, "loss": 2.9536, "step": 592500 }, { "epoch": 1.241333690591821, "grad_norm": 23.89357566833496, "learning_rate": 4.388520742083271e-05, "loss": 2.9493, "step": 593000 }, { "epoch": 1.2423803463174465, "grad_norm": 15.528773307800293, "learning_rate": 4.3879963164342415e-05, "loss": 2.9639, "step": 593500 }, { "epoch": 1.243427002043072, "grad_norm": 37.36931610107422, "learning_rate": 4.3874718907852126e-05, "loss": 2.9447, "step": 594000 }, { "epoch": 1.2444736577686974, "grad_norm": 17.50502586364746, "learning_rate": 4.386947465136183e-05, "loss": 2.932, "step": 594500 }, { "epoch": 1.245520313494323, "grad_norm": 17.634275436401367, "learning_rate": 4.3864230394871535e-05, "loss": 2.9643, "step": 595000 }, { "epoch": 1.2465669692199484, "grad_norm": 20.360092163085938, "learning_rate": 4.3858986138381246e-05, "loss": 2.9441, "step": 595500 }, { "epoch": 1.2476136249455738, "grad_norm": 16.517004013061523, "learning_rate": 4.385374188189095e-05, "loss": 2.9619, "step": 596000 }, { "epoch": 1.2486602806711993, "grad_norm": 13.523192405700684, "learning_rate": 4.384849762540066e-05, "loss": 2.9308, "step": 596500 }, { "epoch": 1.249706936396825, "grad_norm": 21.741304397583008, "learning_rate": 4.384325336891037e-05, "loss": 2.9383, "step": 597000 }, { "epoch": 1.2507535921224504, "grad_norm": 22.07512092590332, "learning_rate": 4.3838009112420084e-05, "loss": 2.9371, "step": 597500 }, { "epoch": 1.2518002478480759, "grad_norm": 15.000121116638184, "learning_rate": 4.383276485592979e-05, "loss": 2.9307, "step": 598000 }, { "epoch": 1.2528469035737013, "grad_norm": 14.456720352172852, "learning_rate": 4.38275205994395e-05, "loss": 2.9656, "step": 598500 }, { "epoch": 1.2538935592993268, "grad_norm": 20.181062698364258, "learning_rate": 4.3822276342949203e-05, "loss": 2.9407, "step": 599000 }, { "epoch": 1.2549402150249522, "grad_norm": 30.429250717163086, "learning_rate": 4.3817032086458914e-05, "loss": 2.9461, "step": 599500 }, { "epoch": 1.2559868707505777, "grad_norm": 13.882564544677734, "learning_rate": 4.381178782996862e-05, "loss": 2.9574, "step": 600000 }, { "epoch": 1.2570335264762034, "grad_norm": 57.829959869384766, "learning_rate": 4.380654357347833e-05, "loss": 2.9448, "step": 600500 }, { "epoch": 1.2580801822018288, "grad_norm": 14.700471878051758, "learning_rate": 4.3801299316988034e-05, "loss": 2.9653, "step": 601000 }, { "epoch": 1.2591268379274543, "grad_norm": 12.941654205322266, "learning_rate": 4.3796055060497745e-05, "loss": 2.9368, "step": 601500 }, { "epoch": 1.2601734936530797, "grad_norm": 15.00155258178711, "learning_rate": 4.3790810804007456e-05, "loss": 2.9616, "step": 602000 }, { "epoch": 1.2612201493787052, "grad_norm": 32.016502380371094, "learning_rate": 4.378556654751716e-05, "loss": 2.9647, "step": 602500 }, { "epoch": 1.2622668051043306, "grad_norm": 12.755894660949707, "learning_rate": 4.378032229102687e-05, "loss": 2.9773, "step": 603000 }, { "epoch": 1.263313460829956, "grad_norm": 17.50634765625, "learning_rate": 4.3775078034536576e-05, "loss": 2.9654, "step": 603500 }, { "epoch": 1.2643601165555816, "grad_norm": 25.208904266357422, "learning_rate": 4.376983377804629e-05, "loss": 2.9425, "step": 604000 }, { "epoch": 1.265406772281207, "grad_norm": 15.632295608520508, "learning_rate": 4.376458952155599e-05, "loss": 2.95, "step": 604500 }, { "epoch": 1.2664534280068325, "grad_norm": 13.209038734436035, "learning_rate": 4.37593452650657e-05, "loss": 2.9515, "step": 605000 }, { "epoch": 1.267500083732458, "grad_norm": 15.47280216217041, "learning_rate": 4.375410100857541e-05, "loss": 2.9725, "step": 605500 }, { "epoch": 1.2685467394580836, "grad_norm": 14.301170349121094, "learning_rate": 4.374885675208512e-05, "loss": 2.9529, "step": 606000 }, { "epoch": 1.269593395183709, "grad_norm": 15.765741348266602, "learning_rate": 4.374361249559483e-05, "loss": 2.9583, "step": 606500 }, { "epoch": 1.2706400509093345, "grad_norm": 24.335847854614258, "learning_rate": 4.373836823910453e-05, "loss": 2.9466, "step": 607000 }, { "epoch": 1.27168670663496, "grad_norm": 17.440401077270508, "learning_rate": 4.3733123982614244e-05, "loss": 2.9483, "step": 607500 }, { "epoch": 1.2727333623605854, "grad_norm": 17.00119972229004, "learning_rate": 4.372787972612395e-05, "loss": 2.9734, "step": 608000 }, { "epoch": 1.2737800180862109, "grad_norm": 16.257761001586914, "learning_rate": 4.372263546963366e-05, "loss": 2.9581, "step": 608500 }, { "epoch": 1.2748266738118363, "grad_norm": 11.97037410736084, "learning_rate": 4.3717391213143364e-05, "loss": 2.9596, "step": 609000 }, { "epoch": 1.275873329537462, "grad_norm": 17.396650314331055, "learning_rate": 4.3712146956653075e-05, "loss": 2.9546, "step": 609500 }, { "epoch": 1.2769199852630875, "grad_norm": 23.70952033996582, "learning_rate": 4.3706902700162786e-05, "loss": 2.9435, "step": 610000 }, { "epoch": 1.277966640988713, "grad_norm": 15.47902774810791, "learning_rate": 4.37016584436725e-05, "loss": 2.9284, "step": 610500 }, { "epoch": 1.2790132967143384, "grad_norm": 16.46710205078125, "learning_rate": 4.36964141871822e-05, "loss": 2.96, "step": 611000 }, { "epoch": 1.2800599524399638, "grad_norm": 13.091629981994629, "learning_rate": 4.369116993069191e-05, "loss": 2.9584, "step": 611500 }, { "epoch": 1.2811066081655893, "grad_norm": 11.74296760559082, "learning_rate": 4.368592567420162e-05, "loss": 2.9382, "step": 612000 }, { "epoch": 1.2821532638912148, "grad_norm": 13.381237030029297, "learning_rate": 4.368068141771132e-05, "loss": 2.9671, "step": 612500 }, { "epoch": 1.2831999196168402, "grad_norm": 21.42831802368164, "learning_rate": 4.367543716122103e-05, "loss": 2.9566, "step": 613000 }, { "epoch": 1.2842465753424657, "grad_norm": 12.76009464263916, "learning_rate": 4.3670192904730737e-05, "loss": 2.9495, "step": 613500 }, { "epoch": 1.2852932310680911, "grad_norm": 24.794614791870117, "learning_rate": 4.366494864824045e-05, "loss": 2.9488, "step": 614000 }, { "epoch": 1.2863398867937166, "grad_norm": 15.808890342712402, "learning_rate": 4.365970439175016e-05, "loss": 2.9649, "step": 614500 }, { "epoch": 1.2873865425193423, "grad_norm": 13.825139045715332, "learning_rate": 4.365446013525987e-05, "loss": 2.9277, "step": 615000 }, { "epoch": 1.2884331982449677, "grad_norm": 13.459590911865234, "learning_rate": 4.3649215878769574e-05, "loss": 2.9312, "step": 615500 }, { "epoch": 1.2894798539705932, "grad_norm": 19.3787784576416, "learning_rate": 4.3643971622279285e-05, "loss": 2.9433, "step": 616000 }, { "epoch": 1.2905265096962186, "grad_norm": 24.596731185913086, "learning_rate": 4.363872736578899e-05, "loss": 2.9415, "step": 616500 }, { "epoch": 1.291573165421844, "grad_norm": 31.661701202392578, "learning_rate": 4.36334831092987e-05, "loss": 2.9404, "step": 617000 }, { "epoch": 1.2926198211474695, "grad_norm": 32.71586608886719, "learning_rate": 4.3628238852808405e-05, "loss": 2.9693, "step": 617500 }, { "epoch": 1.2936664768730952, "grad_norm": 35.92750930786133, "learning_rate": 4.362299459631811e-05, "loss": 2.9426, "step": 618000 }, { "epoch": 1.2947131325987207, "grad_norm": 21.19460105895996, "learning_rate": 4.361775033982782e-05, "loss": 2.9677, "step": 618500 }, { "epoch": 1.2957597883243461, "grad_norm": 21.371219635009766, "learning_rate": 4.361250608333753e-05, "loss": 2.9386, "step": 619000 }, { "epoch": 1.2968064440499716, "grad_norm": 12.552831649780273, "learning_rate": 4.360726182684724e-05, "loss": 2.9653, "step": 619500 }, { "epoch": 1.297853099775597, "grad_norm": 15.034661293029785, "learning_rate": 4.360201757035695e-05, "loss": 2.9559, "step": 620000 }, { "epoch": 1.2988997555012225, "grad_norm": 22.290494918823242, "learning_rate": 4.359677331386666e-05, "loss": 2.9571, "step": 620500 }, { "epoch": 1.299946411226848, "grad_norm": 16.985605239868164, "learning_rate": 4.359152905737636e-05, "loss": 2.9505, "step": 621000 }, { "epoch": 1.3009930669524734, "grad_norm": 22.134254455566406, "learning_rate": 4.358628480088607e-05, "loss": 2.9423, "step": 621500 }, { "epoch": 1.3020397226780989, "grad_norm": 69.7936019897461, "learning_rate": 4.358104054439578e-05, "loss": 2.9113, "step": 622000 }, { "epoch": 1.3030863784037243, "grad_norm": 40.80719757080078, "learning_rate": 4.357579628790549e-05, "loss": 2.9234, "step": 622500 }, { "epoch": 1.3041330341293498, "grad_norm": 27.76987648010254, "learning_rate": 4.357055203141519e-05, "loss": 2.9545, "step": 623000 }, { "epoch": 1.3051796898549755, "grad_norm": 19.44582176208496, "learning_rate": 4.3565307774924904e-05, "loss": 2.9447, "step": 623500 }, { "epoch": 1.306226345580601, "grad_norm": 31.896249771118164, "learning_rate": 4.3560063518434615e-05, "loss": 2.9567, "step": 624000 }, { "epoch": 1.3072730013062264, "grad_norm": 21.75132179260254, "learning_rate": 4.355481926194432e-05, "loss": 2.9426, "step": 624500 }, { "epoch": 1.3083196570318518, "grad_norm": 16.599910736083984, "learning_rate": 4.354957500545403e-05, "loss": 2.934, "step": 625000 }, { "epoch": 1.3093663127574773, "grad_norm": 90.77991485595703, "learning_rate": 4.3544330748963735e-05, "loss": 2.9347, "step": 625500 }, { "epoch": 1.3104129684831027, "grad_norm": 14.111647605895996, "learning_rate": 4.3539086492473446e-05, "loss": 2.9367, "step": 626000 }, { "epoch": 1.3114596242087282, "grad_norm": 59.966766357421875, "learning_rate": 4.353384223598315e-05, "loss": 2.9404, "step": 626500 }, { "epoch": 1.3125062799343539, "grad_norm": 64.15888977050781, "learning_rate": 4.352859797949286e-05, "loss": 2.9546, "step": 627000 }, { "epoch": 1.3135529356599793, "grad_norm": 15.227395057678223, "learning_rate": 4.352335372300257e-05, "loss": 2.948, "step": 627500 }, { "epoch": 1.3145995913856048, "grad_norm": 22.66758155822754, "learning_rate": 4.351810946651228e-05, "loss": 2.9572, "step": 628000 }, { "epoch": 1.3156462471112302, "grad_norm": 35.623165130615234, "learning_rate": 4.351286521002199e-05, "loss": 2.9487, "step": 628500 }, { "epoch": 1.3166929028368557, "grad_norm": 23.488719940185547, "learning_rate": 4.350762095353169e-05, "loss": 2.9481, "step": 629000 }, { "epoch": 1.3177395585624812, "grad_norm": 40.649391174316406, "learning_rate": 4.35023766970414e-05, "loss": 2.9436, "step": 629500 }, { "epoch": 1.3187862142881066, "grad_norm": 13.477127075195312, "learning_rate": 4.349713244055111e-05, "loss": 2.975, "step": 630000 }, { "epoch": 1.319832870013732, "grad_norm": 12.91096305847168, "learning_rate": 4.349188818406082e-05, "loss": 2.9499, "step": 630500 }, { "epoch": 1.3208795257393575, "grad_norm": 14.900449752807617, "learning_rate": 4.348664392757052e-05, "loss": 2.9619, "step": 631000 }, { "epoch": 1.321926181464983, "grad_norm": 13.177579879760742, "learning_rate": 4.3481399671080234e-05, "loss": 2.9465, "step": 631500 }, { "epoch": 1.3229728371906084, "grad_norm": 21.62117576599121, "learning_rate": 4.3476155414589945e-05, "loss": 2.9468, "step": 632000 }, { "epoch": 1.324019492916234, "grad_norm": 88.81108856201172, "learning_rate": 4.3470911158099656e-05, "loss": 2.9427, "step": 632500 }, { "epoch": 1.3250661486418596, "grad_norm": 20.201501846313477, "learning_rate": 4.346566690160936e-05, "loss": 2.9432, "step": 633000 }, { "epoch": 1.326112804367485, "grad_norm": 55.61350631713867, "learning_rate": 4.346042264511907e-05, "loss": 2.942, "step": 633500 }, { "epoch": 1.3271594600931105, "grad_norm": 19.59898567199707, "learning_rate": 4.3455178388628776e-05, "loss": 2.9421, "step": 634000 }, { "epoch": 1.328206115818736, "grad_norm": 129.44447326660156, "learning_rate": 4.344993413213849e-05, "loss": 2.9372, "step": 634500 }, { "epoch": 1.3292527715443614, "grad_norm": 18.181604385375977, "learning_rate": 4.344468987564819e-05, "loss": 2.946, "step": 635000 }, { "epoch": 1.330299427269987, "grad_norm": 60.080970764160156, "learning_rate": 4.3439445619157895e-05, "loss": 2.9103, "step": 635500 }, { "epoch": 1.3313460829956125, "grad_norm": 19.467023849487305, "learning_rate": 4.3434201362667606e-05, "loss": 2.9283, "step": 636000 }, { "epoch": 1.332392738721238, "grad_norm": 20.5722713470459, "learning_rate": 4.342895710617732e-05, "loss": 2.9394, "step": 636500 }, { "epoch": 1.3334393944468634, "grad_norm": 16.186805725097656, "learning_rate": 4.342371284968703e-05, "loss": 2.9461, "step": 637000 }, { "epoch": 1.334486050172489, "grad_norm": 21.504770278930664, "learning_rate": 4.341846859319673e-05, "loss": 2.9631, "step": 637500 }, { "epoch": 1.3355327058981143, "grad_norm": 16.587635040283203, "learning_rate": 4.3413224336706444e-05, "loss": 2.9659, "step": 638000 }, { "epoch": 1.3365793616237398, "grad_norm": 34.4616584777832, "learning_rate": 4.340798008021615e-05, "loss": 2.9812, "step": 638500 }, { "epoch": 1.3376260173493653, "grad_norm": 16.15749168395996, "learning_rate": 4.340273582372586e-05, "loss": 2.945, "step": 639000 }, { "epoch": 1.3386726730749907, "grad_norm": 18.60582733154297, "learning_rate": 4.3397491567235564e-05, "loss": 2.9361, "step": 639500 }, { "epoch": 1.3397193288006162, "grad_norm": 35.563961029052734, "learning_rate": 4.3392247310745275e-05, "loss": 2.965, "step": 640000 }, { "epoch": 1.3407659845262416, "grad_norm": 23.264610290527344, "learning_rate": 4.338700305425498e-05, "loss": 2.9433, "step": 640500 }, { "epoch": 1.3418126402518673, "grad_norm": 14.169709205627441, "learning_rate": 4.338175879776469e-05, "loss": 2.9418, "step": 641000 }, { "epoch": 1.3428592959774928, "grad_norm": 19.01775550842285, "learning_rate": 4.33765145412744e-05, "loss": 2.9575, "step": 641500 }, { "epoch": 1.3439059517031182, "grad_norm": 16.287757873535156, "learning_rate": 4.3371270284784105e-05, "loss": 2.9456, "step": 642000 }, { "epoch": 1.3449526074287437, "grad_norm": 17.753149032592773, "learning_rate": 4.3366026028293817e-05, "loss": 2.9543, "step": 642500 }, { "epoch": 1.3459992631543691, "grad_norm": 22.021379470825195, "learning_rate": 4.336078177180352e-05, "loss": 2.9574, "step": 643000 }, { "epoch": 1.3470459188799946, "grad_norm": 21.503376007080078, "learning_rate": 4.335553751531323e-05, "loss": 2.9647, "step": 643500 }, { "epoch": 1.34809257460562, "grad_norm": 17.9859561920166, "learning_rate": 4.3350293258822936e-05, "loss": 2.9562, "step": 644000 }, { "epoch": 1.3491392303312457, "grad_norm": 24.51833152770996, "learning_rate": 4.334504900233265e-05, "loss": 2.9634, "step": 644500 }, { "epoch": 1.3501858860568712, "grad_norm": 214.4727325439453, "learning_rate": 4.333980474584236e-05, "loss": 2.9722, "step": 645000 }, { "epoch": 1.3512325417824966, "grad_norm": 13.428659439086914, "learning_rate": 4.333456048935207e-05, "loss": 2.9491, "step": 645500 }, { "epoch": 1.352279197508122, "grad_norm": 12.121609687805176, "learning_rate": 4.3329316232861774e-05, "loss": 2.9337, "step": 646000 }, { "epoch": 1.3533258532337475, "grad_norm": 15.777915000915527, "learning_rate": 4.332407197637148e-05, "loss": 2.9449, "step": 646500 }, { "epoch": 1.354372508959373, "grad_norm": 13.55416202545166, "learning_rate": 4.331882771988119e-05, "loss": 2.9566, "step": 647000 }, { "epoch": 1.3554191646849985, "grad_norm": 29.739959716796875, "learning_rate": 4.3313583463390893e-05, "loss": 2.9552, "step": 647500 }, { "epoch": 1.356465820410624, "grad_norm": 16.770851135253906, "learning_rate": 4.3308339206900605e-05, "loss": 2.9494, "step": 648000 }, { "epoch": 1.3575124761362494, "grad_norm": 16.43889045715332, "learning_rate": 4.330309495041031e-05, "loss": 2.9608, "step": 648500 }, { "epoch": 1.3585591318618748, "grad_norm": 17.451663970947266, "learning_rate": 4.329785069392002e-05, "loss": 2.9403, "step": 649000 }, { "epoch": 1.3596057875875003, "grad_norm": 29.002647399902344, "learning_rate": 4.329260643742973e-05, "loss": 2.9464, "step": 649500 }, { "epoch": 1.360652443313126, "grad_norm": 15.871525764465332, "learning_rate": 4.328736218093944e-05, "loss": 2.9232, "step": 650000 }, { "epoch": 1.3616990990387514, "grad_norm": 14.790121078491211, "learning_rate": 4.3282117924449146e-05, "loss": 2.9467, "step": 650500 }, { "epoch": 1.3627457547643769, "grad_norm": 18.181758880615234, "learning_rate": 4.327687366795886e-05, "loss": 2.9479, "step": 651000 }, { "epoch": 1.3637924104900023, "grad_norm": 25.732467651367188, "learning_rate": 4.327162941146856e-05, "loss": 2.9512, "step": 651500 }, { "epoch": 1.3648390662156278, "grad_norm": 14.007415771484375, "learning_rate": 4.3266385154978266e-05, "loss": 2.948, "step": 652000 }, { "epoch": 1.3658857219412532, "grad_norm": 11.45348072052002, "learning_rate": 4.326114089848798e-05, "loss": 2.9213, "step": 652500 }, { "epoch": 1.366932377666879, "grad_norm": 21.92885971069336, "learning_rate": 4.325589664199768e-05, "loss": 2.9288, "step": 653000 }, { "epoch": 1.3679790333925044, "grad_norm": 19.603214263916016, "learning_rate": 4.325065238550739e-05, "loss": 2.9399, "step": 653500 }, { "epoch": 1.3690256891181298, "grad_norm": 31.02138328552246, "learning_rate": 4.3245408129017104e-05, "loss": 2.9335, "step": 654000 }, { "epoch": 1.3700723448437553, "grad_norm": 41.87757110595703, "learning_rate": 4.3240163872526815e-05, "loss": 2.9651, "step": 654500 }, { "epoch": 1.3711190005693807, "grad_norm": 35.32951736450195, "learning_rate": 4.323491961603652e-05, "loss": 2.9468, "step": 655000 }, { "epoch": 1.3721656562950062, "grad_norm": 81.44174194335938, "learning_rate": 4.322967535954623e-05, "loss": 2.9465, "step": 655500 }, { "epoch": 1.3732123120206317, "grad_norm": 43.93706130981445, "learning_rate": 4.3224431103055934e-05, "loss": 2.9522, "step": 656000 }, { "epoch": 1.374258967746257, "grad_norm": 24.791166305541992, "learning_rate": 4.3219186846565645e-05, "loss": 2.9583, "step": 656500 }, { "epoch": 1.3753056234718826, "grad_norm": 13.720112800598145, "learning_rate": 4.321394259007535e-05, "loss": 2.9264, "step": 657000 }, { "epoch": 1.376352279197508, "grad_norm": 20.103694915771484, "learning_rate": 4.3208698333585054e-05, "loss": 2.9465, "step": 657500 }, { "epoch": 1.3773989349231335, "grad_norm": 24.260622024536133, "learning_rate": 4.3203454077094765e-05, "loss": 2.9474, "step": 658000 }, { "epoch": 1.3784455906487592, "grad_norm": 15.010303497314453, "learning_rate": 4.3198209820604476e-05, "loss": 2.9354, "step": 658500 }, { "epoch": 1.3794922463743846, "grad_norm": 21.447298049926758, "learning_rate": 4.319296556411419e-05, "loss": 2.9542, "step": 659000 }, { "epoch": 1.38053890210001, "grad_norm": 46.949283599853516, "learning_rate": 4.318772130762389e-05, "loss": 2.9626, "step": 659500 }, { "epoch": 1.3815855578256355, "grad_norm": 16.035385131835938, "learning_rate": 4.31824770511336e-05, "loss": 2.9556, "step": 660000 }, { "epoch": 1.382632213551261, "grad_norm": 18.86653709411621, "learning_rate": 4.317723279464331e-05, "loss": 2.9439, "step": 660500 }, { "epoch": 1.3836788692768864, "grad_norm": 36.015838623046875, "learning_rate": 4.317198853815302e-05, "loss": 2.9367, "step": 661000 }, { "epoch": 1.384725525002512, "grad_norm": 17.04486656188965, "learning_rate": 4.316674428166272e-05, "loss": 2.9118, "step": 661500 }, { "epoch": 1.3857721807281376, "grad_norm": 19.173154830932617, "learning_rate": 4.3161500025172433e-05, "loss": 2.9604, "step": 662000 }, { "epoch": 1.386818836453763, "grad_norm": 15.832111358642578, "learning_rate": 4.315625576868214e-05, "loss": 2.9403, "step": 662500 }, { "epoch": 1.3878654921793885, "grad_norm": 13.961092948913574, "learning_rate": 4.315101151219185e-05, "loss": 2.9422, "step": 663000 }, { "epoch": 1.388912147905014, "grad_norm": 15.129354476928711, "learning_rate": 4.314576725570156e-05, "loss": 2.9465, "step": 663500 }, { "epoch": 1.3899588036306394, "grad_norm": 13.611313819885254, "learning_rate": 4.3140522999211264e-05, "loss": 2.915, "step": 664000 }, { "epoch": 1.3910054593562649, "grad_norm": 13.737316131591797, "learning_rate": 4.3135278742720975e-05, "loss": 2.9222, "step": 664500 }, { "epoch": 1.3920521150818903, "grad_norm": 12.163566589355469, "learning_rate": 4.313003448623068e-05, "loss": 2.9509, "step": 665000 }, { "epoch": 1.3930987708075158, "grad_norm": 24.514543533325195, "learning_rate": 4.312479022974039e-05, "loss": 2.9478, "step": 665500 }, { "epoch": 1.3941454265331412, "grad_norm": 14.250805854797363, "learning_rate": 4.3119545973250095e-05, "loss": 2.9396, "step": 666000 }, { "epoch": 1.3951920822587667, "grad_norm": 55.37478256225586, "learning_rate": 4.3114301716759806e-05, "loss": 2.9208, "step": 666500 }, { "epoch": 1.3962387379843924, "grad_norm": 13.781594276428223, "learning_rate": 4.310905746026952e-05, "loss": 2.9255, "step": 667000 }, { "epoch": 1.3972853937100178, "grad_norm": 13.536470413208008, "learning_rate": 4.310381320377923e-05, "loss": 2.9501, "step": 667500 }, { "epoch": 1.3983320494356433, "grad_norm": 15.029908180236816, "learning_rate": 4.309856894728893e-05, "loss": 2.9263, "step": 668000 }, { "epoch": 1.3993787051612687, "grad_norm": 13.003904342651367, "learning_rate": 4.3093324690798644e-05, "loss": 2.9369, "step": 668500 }, { "epoch": 1.4004253608868942, "grad_norm": 14.085989952087402, "learning_rate": 4.308808043430835e-05, "loss": 2.926, "step": 669000 }, { "epoch": 1.4014720166125196, "grad_norm": 15.151104927062988, "learning_rate": 4.308283617781805e-05, "loss": 2.9364, "step": 669500 }, { "epoch": 1.402518672338145, "grad_norm": 13.537474632263184, "learning_rate": 4.307759192132776e-05, "loss": 2.9289, "step": 670000 }, { "epoch": 1.4035653280637708, "grad_norm": 21.446514129638672, "learning_rate": 4.307234766483747e-05, "loss": 2.933, "step": 670500 }, { "epoch": 1.4046119837893962, "grad_norm": 28.075950622558594, "learning_rate": 4.306710340834718e-05, "loss": 2.9139, "step": 671000 }, { "epoch": 1.4056586395150217, "grad_norm": 21.27945327758789, "learning_rate": 4.306185915185689e-05, "loss": 2.919, "step": 671500 }, { "epoch": 1.4067052952406471, "grad_norm": 23.552581787109375, "learning_rate": 4.30566148953666e-05, "loss": 2.949, "step": 672000 }, { "epoch": 1.4077519509662726, "grad_norm": 17.992237091064453, "learning_rate": 4.3051370638876305e-05, "loss": 2.9347, "step": 672500 }, { "epoch": 1.408798606691898, "grad_norm": 22.340518951416016, "learning_rate": 4.3046126382386016e-05, "loss": 2.9269, "step": 673000 }, { "epoch": 1.4098452624175235, "grad_norm": 22.367748260498047, "learning_rate": 4.304088212589572e-05, "loss": 2.9195, "step": 673500 }, { "epoch": 1.410891918143149, "grad_norm": 17.664447784423828, "learning_rate": 4.303563786940543e-05, "loss": 2.9124, "step": 674000 }, { "epoch": 1.4119385738687744, "grad_norm": 26.599468231201172, "learning_rate": 4.3030393612915136e-05, "loss": 2.931, "step": 674500 }, { "epoch": 1.4129852295943999, "grad_norm": 14.174084663391113, "learning_rate": 4.302514935642484e-05, "loss": 2.9349, "step": 675000 }, { "epoch": 1.4140318853200253, "grad_norm": 26.087718963623047, "learning_rate": 4.301990509993455e-05, "loss": 2.9305, "step": 675500 }, { "epoch": 1.415078541045651, "grad_norm": 133.5597686767578, "learning_rate": 4.301466084344426e-05, "loss": 2.9439, "step": 676000 }, { "epoch": 1.4161251967712765, "grad_norm": 14.880332946777344, "learning_rate": 4.3009416586953973e-05, "loss": 2.9394, "step": 676500 }, { "epoch": 1.417171852496902, "grad_norm": 39.263187408447266, "learning_rate": 4.300417233046368e-05, "loss": 2.9241, "step": 677000 }, { "epoch": 1.4182185082225274, "grad_norm": 14.035211563110352, "learning_rate": 4.299892807397339e-05, "loss": 2.9232, "step": 677500 }, { "epoch": 1.4192651639481528, "grad_norm": 13.205121994018555, "learning_rate": 4.299368381748309e-05, "loss": 2.9482, "step": 678000 }, { "epoch": 1.4203118196737783, "grad_norm": 25.500457763671875, "learning_rate": 4.2988439560992804e-05, "loss": 2.9416, "step": 678500 }, { "epoch": 1.4213584753994037, "grad_norm": 18.642480850219727, "learning_rate": 4.298319530450251e-05, "loss": 2.9466, "step": 679000 }, { "epoch": 1.4224051311250294, "grad_norm": 13.942402839660645, "learning_rate": 4.297795104801222e-05, "loss": 2.9323, "step": 679500 }, { "epoch": 1.4234517868506549, "grad_norm": 26.655288696289062, "learning_rate": 4.2972706791521924e-05, "loss": 2.9355, "step": 680000 }, { "epoch": 1.4244984425762803, "grad_norm": 17.46067237854004, "learning_rate": 4.2967462535031635e-05, "loss": 2.9478, "step": 680500 }, { "epoch": 1.4255450983019058, "grad_norm": 19.024112701416016, "learning_rate": 4.2962218278541346e-05, "loss": 2.9437, "step": 681000 }, { "epoch": 1.4265917540275312, "grad_norm": 13.938742637634277, "learning_rate": 4.295697402205105e-05, "loss": 2.9387, "step": 681500 }, { "epoch": 1.4276384097531567, "grad_norm": 13.60888671875, "learning_rate": 4.295172976556076e-05, "loss": 2.9311, "step": 682000 }, { "epoch": 1.4286850654787822, "grad_norm": 14.706408500671387, "learning_rate": 4.2946485509070466e-05, "loss": 2.9219, "step": 682500 }, { "epoch": 1.4297317212044076, "grad_norm": 23.862640380859375, "learning_rate": 4.294124125258018e-05, "loss": 2.9177, "step": 683000 }, { "epoch": 1.430778376930033, "grad_norm": 13.935612678527832, "learning_rate": 4.293599699608988e-05, "loss": 2.9292, "step": 683500 }, { "epoch": 1.4318250326556585, "grad_norm": 17.506641387939453, "learning_rate": 4.293075273959959e-05, "loss": 2.9176, "step": 684000 }, { "epoch": 1.4328716883812842, "grad_norm": 20.36502456665039, "learning_rate": 4.29255084831093e-05, "loss": 2.9399, "step": 684500 }, { "epoch": 1.4339183441069097, "grad_norm": 20.9641170501709, "learning_rate": 4.2920264226619014e-05, "loss": 2.9051, "step": 685000 }, { "epoch": 1.4349649998325351, "grad_norm": 13.461139678955078, "learning_rate": 4.291501997012872e-05, "loss": 2.9269, "step": 685500 }, { "epoch": 1.4360116555581606, "grad_norm": 16.350236892700195, "learning_rate": 4.290977571363842e-05, "loss": 2.9272, "step": 686000 }, { "epoch": 1.437058311283786, "grad_norm": 21.412010192871094, "learning_rate": 4.2904531457148134e-05, "loss": 2.9412, "step": 686500 }, { "epoch": 1.4381049670094115, "grad_norm": 15.718482971191406, "learning_rate": 4.289928720065784e-05, "loss": 2.9021, "step": 687000 }, { "epoch": 1.439151622735037, "grad_norm": 14.883883476257324, "learning_rate": 4.289404294416755e-05, "loss": 2.895, "step": 687500 }, { "epoch": 1.4401982784606626, "grad_norm": 62.303646087646484, "learning_rate": 4.2888798687677254e-05, "loss": 2.9262, "step": 688000 }, { "epoch": 1.441244934186288, "grad_norm": 14.69914436340332, "learning_rate": 4.2883554431186965e-05, "loss": 2.9005, "step": 688500 }, { "epoch": 1.4422915899119135, "grad_norm": 15.396275520324707, "learning_rate": 4.2878310174696676e-05, "loss": 2.9152, "step": 689000 }, { "epoch": 1.443338245637539, "grad_norm": 13.945470809936523, "learning_rate": 4.287306591820639e-05, "loss": 2.9141, "step": 689500 }, { "epoch": 1.4443849013631644, "grad_norm": 34.46780776977539, "learning_rate": 4.286782166171609e-05, "loss": 2.9243, "step": 690000 }, { "epoch": 1.44543155708879, "grad_norm": 19.499616622924805, "learning_rate": 4.28625774052258e-05, "loss": 2.9031, "step": 690500 }, { "epoch": 1.4464782128144154, "grad_norm": 16.21074104309082, "learning_rate": 4.285733314873551e-05, "loss": 2.9167, "step": 691000 }, { "epoch": 1.4475248685400408, "grad_norm": 13.872232437133789, "learning_rate": 4.285208889224521e-05, "loss": 2.9223, "step": 691500 }, { "epoch": 1.4485715242656663, "grad_norm": 11.96998119354248, "learning_rate": 4.284684463575492e-05, "loss": 2.9236, "step": 692000 }, { "epoch": 1.4496181799912917, "grad_norm": 11.970244407653809, "learning_rate": 4.2841600379264626e-05, "loss": 2.924, "step": 692500 }, { "epoch": 1.4506648357169172, "grad_norm": 18.264015197753906, "learning_rate": 4.283635612277434e-05, "loss": 2.9495, "step": 693000 }, { "epoch": 1.4517114914425429, "grad_norm": 15.941812515258789, "learning_rate": 4.283111186628405e-05, "loss": 2.9322, "step": 693500 }, { "epoch": 1.4527581471681683, "grad_norm": 16.078153610229492, "learning_rate": 4.282586760979376e-05, "loss": 2.9138, "step": 694000 }, { "epoch": 1.4538048028937938, "grad_norm": 15.191886901855469, "learning_rate": 4.2820623353303464e-05, "loss": 2.9117, "step": 694500 }, { "epoch": 1.4548514586194192, "grad_norm": 16.033891677856445, "learning_rate": 4.2815379096813175e-05, "loss": 2.915, "step": 695000 }, { "epoch": 1.4558981143450447, "grad_norm": 14.22435474395752, "learning_rate": 4.281013484032288e-05, "loss": 2.9125, "step": 695500 }, { "epoch": 1.4569447700706701, "grad_norm": 40.06970977783203, "learning_rate": 4.280489058383259e-05, "loss": 2.9115, "step": 696000 }, { "epoch": 1.4579914257962958, "grad_norm": 17.466670989990234, "learning_rate": 4.2799646327342295e-05, "loss": 2.9191, "step": 696500 }, { "epoch": 1.4590380815219213, "grad_norm": 25.307153701782227, "learning_rate": 4.2794402070852006e-05, "loss": 2.88, "step": 697000 }, { "epoch": 1.4600847372475467, "grad_norm": 14.659607887268066, "learning_rate": 4.278915781436171e-05, "loss": 2.9137, "step": 697500 }, { "epoch": 1.4611313929731722, "grad_norm": 15.87960433959961, "learning_rate": 4.278391355787142e-05, "loss": 2.9153, "step": 698000 }, { "epoch": 1.4621780486987976, "grad_norm": 17.831218719482422, "learning_rate": 4.277866930138113e-05, "loss": 2.9097, "step": 698500 }, { "epoch": 1.463224704424423, "grad_norm": 12.554692268371582, "learning_rate": 4.2773425044890836e-05, "loss": 2.9104, "step": 699000 }, { "epoch": 1.4642713601500486, "grad_norm": 17.312501907348633, "learning_rate": 4.276818078840055e-05, "loss": 2.9103, "step": 699500 }, { "epoch": 1.465318015875674, "grad_norm": 13.694412231445312, "learning_rate": 4.276293653191025e-05, "loss": 2.9155, "step": 700000 }, { "epoch": 1.4663646716012995, "grad_norm": 14.71805191040039, "learning_rate": 4.275769227541996e-05, "loss": 2.8943, "step": 700500 }, { "epoch": 1.467411327326925, "grad_norm": 15.44802474975586, "learning_rate": 4.275244801892967e-05, "loss": 2.9338, "step": 701000 }, { "epoch": 1.4684579830525504, "grad_norm": 11.197710037231445, "learning_rate": 4.274720376243938e-05, "loss": 2.897, "step": 701500 }, { "epoch": 1.469504638778176, "grad_norm": 13.563399314880371, "learning_rate": 4.274195950594909e-05, "loss": 2.9143, "step": 702000 }, { "epoch": 1.4705512945038015, "grad_norm": 17.40575408935547, "learning_rate": 4.27367152494588e-05, "loss": 2.917, "step": 702500 }, { "epoch": 1.471597950229427, "grad_norm": 12.317706108093262, "learning_rate": 4.2731470992968505e-05, "loss": 2.922, "step": 703000 }, { "epoch": 1.4726446059550524, "grad_norm": 70.7812271118164, "learning_rate": 4.272622673647821e-05, "loss": 2.8952, "step": 703500 }, { "epoch": 1.4736912616806779, "grad_norm": 16.612009048461914, "learning_rate": 4.272098247998792e-05, "loss": 2.8899, "step": 704000 }, { "epoch": 1.4747379174063033, "grad_norm": 13.369139671325684, "learning_rate": 4.2715738223497624e-05, "loss": 2.8798, "step": 704500 }, { "epoch": 1.4757845731319288, "grad_norm": 20.52071189880371, "learning_rate": 4.2710493967007336e-05, "loss": 2.8936, "step": 705000 }, { "epoch": 1.4768312288575545, "grad_norm": 33.9241828918457, "learning_rate": 4.270524971051704e-05, "loss": 2.8961, "step": 705500 }, { "epoch": 1.47787788458318, "grad_norm": 122.27322387695312, "learning_rate": 4.270000545402675e-05, "loss": 2.9245, "step": 706000 }, { "epoch": 1.4789245403088054, "grad_norm": 17.287372589111328, "learning_rate": 4.269476119753646e-05, "loss": 2.8897, "step": 706500 }, { "epoch": 1.4799711960344308, "grad_norm": 19.36941909790039, "learning_rate": 4.268951694104617e-05, "loss": 2.8979, "step": 707000 }, { "epoch": 1.4810178517600563, "grad_norm": 13.853858947753906, "learning_rate": 4.268427268455588e-05, "loss": 2.9125, "step": 707500 }, { "epoch": 1.4820645074856817, "grad_norm": 13.141562461853027, "learning_rate": 4.267902842806559e-05, "loss": 2.8973, "step": 708000 }, { "epoch": 1.4831111632113072, "grad_norm": 17.443998336791992, "learning_rate": 4.267378417157529e-05, "loss": 2.9045, "step": 708500 }, { "epoch": 1.4841578189369327, "grad_norm": 13.166300773620605, "learning_rate": 4.2668539915085e-05, "loss": 2.9023, "step": 709000 }, { "epoch": 1.4852044746625581, "grad_norm": 13.745686531066895, "learning_rate": 4.266329565859471e-05, "loss": 2.899, "step": 709500 }, { "epoch": 1.4862511303881836, "grad_norm": 12.661431312561035, "learning_rate": 4.265805140210441e-05, "loss": 2.899, "step": 710000 }, { "epoch": 1.487297786113809, "grad_norm": 24.904247283935547, "learning_rate": 4.2652807145614124e-05, "loss": 2.9079, "step": 710500 }, { "epoch": 1.4883444418394347, "grad_norm": 21.60515785217285, "learning_rate": 4.2647562889123835e-05, "loss": 2.9024, "step": 711000 }, { "epoch": 1.4893910975650602, "grad_norm": 13.360514640808105, "learning_rate": 4.2642318632633546e-05, "loss": 2.9213, "step": 711500 }, { "epoch": 1.4904377532906856, "grad_norm": 15.483306884765625, "learning_rate": 4.263707437614325e-05, "loss": 2.8887, "step": 712000 }, { "epoch": 1.491484409016311, "grad_norm": 14.647512435913086, "learning_rate": 4.263183011965296e-05, "loss": 2.8888, "step": 712500 }, { "epoch": 1.4925310647419365, "grad_norm": 12.203367233276367, "learning_rate": 4.2626585863162665e-05, "loss": 2.894, "step": 713000 }, { "epoch": 1.493577720467562, "grad_norm": 14.00558090209961, "learning_rate": 4.2621341606672376e-05, "loss": 2.892, "step": 713500 }, { "epoch": 1.4946243761931877, "grad_norm": 41.41148376464844, "learning_rate": 4.261609735018208e-05, "loss": 2.9335, "step": 714000 }, { "epoch": 1.4956710319188131, "grad_norm": 16.102394104003906, "learning_rate": 4.2610853093691785e-05, "loss": 2.9051, "step": 714500 }, { "epoch": 1.4967176876444386, "grad_norm": 11.68280029296875, "learning_rate": 4.2605608837201496e-05, "loss": 2.9063, "step": 715000 }, { "epoch": 1.497764343370064, "grad_norm": 16.01552963256836, "learning_rate": 4.260036458071121e-05, "loss": 2.9068, "step": 715500 }, { "epoch": 1.4988109990956895, "grad_norm": 15.057000160217285, "learning_rate": 4.259512032422092e-05, "loss": 2.8893, "step": 716000 }, { "epoch": 1.499857654821315, "grad_norm": 13.23488998413086, "learning_rate": 4.258987606773062e-05, "loss": 2.9181, "step": 716500 }, { "epoch": 1.5009043105469404, "grad_norm": 13.608189582824707, "learning_rate": 4.2584631811240334e-05, "loss": 2.885, "step": 717000 }, { "epoch": 1.5019509662725659, "grad_norm": 12.806418418884277, "learning_rate": 4.257938755475004e-05, "loss": 2.9121, "step": 717500 }, { "epoch": 1.5029976219981913, "grad_norm": 23.46586799621582, "learning_rate": 4.257414329825975e-05, "loss": 2.8985, "step": 718000 }, { "epoch": 1.5040442777238168, "grad_norm": 15.249083518981934, "learning_rate": 4.2568899041769453e-05, "loss": 2.8954, "step": 718500 }, { "epoch": 1.5050909334494422, "grad_norm": 12.297357559204102, "learning_rate": 4.2563654785279164e-05, "loss": 2.882, "step": 719000 }, { "epoch": 1.5061375891750677, "grad_norm": 14.218979835510254, "learning_rate": 4.2558410528788876e-05, "loss": 2.9022, "step": 719500 }, { "epoch": 1.5071842449006934, "grad_norm": 11.753341674804688, "learning_rate": 4.255316627229858e-05, "loss": 2.8775, "step": 720000 }, { "epoch": 1.5082309006263188, "grad_norm": 14.631226539611816, "learning_rate": 4.254792201580829e-05, "loss": 2.8803, "step": 720500 }, { "epoch": 1.5092775563519443, "grad_norm": 23.92547035217285, "learning_rate": 4.2542677759317995e-05, "loss": 2.8879, "step": 721000 }, { "epoch": 1.5103242120775697, "grad_norm": 12.717313766479492, "learning_rate": 4.2537433502827706e-05, "loss": 2.8837, "step": 721500 }, { "epoch": 1.5113708678031952, "grad_norm": 115.02717590332031, "learning_rate": 4.253218924633741e-05, "loss": 2.8723, "step": 722000 }, { "epoch": 1.5124175235288209, "grad_norm": 13.995833396911621, "learning_rate": 4.252694498984712e-05, "loss": 2.8766, "step": 722500 }, { "epoch": 1.5134641792544463, "grad_norm": 15.411415100097656, "learning_rate": 4.2521700733356826e-05, "loss": 2.8969, "step": 723000 }, { "epoch": 1.5145108349800718, "grad_norm": 20.12826156616211, "learning_rate": 4.251645647686654e-05, "loss": 2.9085, "step": 723500 }, { "epoch": 1.5155574907056972, "grad_norm": 15.859312057495117, "learning_rate": 4.251121222037625e-05, "loss": 2.8984, "step": 724000 }, { "epoch": 1.5166041464313227, "grad_norm": 18.323270797729492, "learning_rate": 4.250596796388596e-05, "loss": 2.8864, "step": 724500 }, { "epoch": 1.5176508021569481, "grad_norm": 20.153303146362305, "learning_rate": 4.2500723707395664e-05, "loss": 2.8909, "step": 725000 }, { "epoch": 1.5186974578825736, "grad_norm": 28.872766494750977, "learning_rate": 4.249547945090537e-05, "loss": 2.8924, "step": 725500 }, { "epoch": 1.519744113608199, "grad_norm": 12.604666709899902, "learning_rate": 4.249023519441508e-05, "loss": 2.9026, "step": 726000 }, { "epoch": 1.5207907693338245, "grad_norm": 21.897659301757812, "learning_rate": 4.248499093792478e-05, "loss": 2.9029, "step": 726500 }, { "epoch": 1.52183742505945, "grad_norm": 19.126192092895508, "learning_rate": 4.2479746681434494e-05, "loss": 2.8948, "step": 727000 }, { "epoch": 1.5228840807850754, "grad_norm": 20.493392944335938, "learning_rate": 4.24745024249442e-05, "loss": 2.9093, "step": 727500 }, { "epoch": 1.5239307365107009, "grad_norm": 155.40965270996094, "learning_rate": 4.246925816845391e-05, "loss": 2.8996, "step": 728000 }, { "epoch": 1.5249773922363263, "grad_norm": 17.865053176879883, "learning_rate": 4.246401391196362e-05, "loss": 2.8716, "step": 728500 }, { "epoch": 1.526024047961952, "grad_norm": 35.551910400390625, "learning_rate": 4.245876965547333e-05, "loss": 2.885, "step": 729000 }, { "epoch": 1.5270707036875775, "grad_norm": 16.795406341552734, "learning_rate": 4.2453525398983036e-05, "loss": 2.9024, "step": 729500 }, { "epoch": 1.528117359413203, "grad_norm": 14.00898265838623, "learning_rate": 4.244828114249275e-05, "loss": 2.8911, "step": 730000 }, { "epoch": 1.5291640151388284, "grad_norm": 85.49053955078125, "learning_rate": 4.244303688600245e-05, "loss": 2.9007, "step": 730500 }, { "epoch": 1.530210670864454, "grad_norm": 15.37946605682373, "learning_rate": 4.243779262951216e-05, "loss": 2.899, "step": 731000 }, { "epoch": 1.5312573265900795, "grad_norm": 17.834266662597656, "learning_rate": 4.243254837302187e-05, "loss": 2.9133, "step": 731500 }, { "epoch": 1.532303982315705, "grad_norm": 21.968175888061523, "learning_rate": 4.242730411653157e-05, "loss": 2.8864, "step": 732000 }, { "epoch": 1.5333506380413304, "grad_norm": 13.913899421691895, "learning_rate": 4.242205986004128e-05, "loss": 2.9038, "step": 732500 }, { "epoch": 1.5343972937669559, "grad_norm": 224.1530303955078, "learning_rate": 4.241681560355099e-05, "loss": 2.8807, "step": 733000 }, { "epoch": 1.5354439494925813, "grad_norm": 18.07832145690918, "learning_rate": 4.2411571347060704e-05, "loss": 2.8996, "step": 733500 }, { "epoch": 1.5364906052182068, "grad_norm": 12.360515594482422, "learning_rate": 4.240632709057041e-05, "loss": 2.8954, "step": 734000 }, { "epoch": 1.5375372609438323, "grad_norm": 84.62981414794922, "learning_rate": 4.240108283408012e-05, "loss": 2.8894, "step": 734500 }, { "epoch": 1.5385839166694577, "grad_norm": 24.28133201599121, "learning_rate": 4.2395838577589824e-05, "loss": 2.8636, "step": 735000 }, { "epoch": 1.5396305723950832, "grad_norm": 11.94083023071289, "learning_rate": 4.2390594321099535e-05, "loss": 2.904, "step": 735500 }, { "epoch": 1.5406772281207086, "grad_norm": 16.09222412109375, "learning_rate": 4.238535006460924e-05, "loss": 2.8919, "step": 736000 }, { "epoch": 1.541723883846334, "grad_norm": 13.212508201599121, "learning_rate": 4.238010580811895e-05, "loss": 2.8951, "step": 736500 }, { "epoch": 1.5427705395719595, "grad_norm": 14.54599666595459, "learning_rate": 4.237486155162866e-05, "loss": 2.8977, "step": 737000 }, { "epoch": 1.5438171952975852, "grad_norm": 15.111337661743164, "learning_rate": 4.2369617295138366e-05, "loss": 2.8905, "step": 737500 }, { "epoch": 1.5448638510232107, "grad_norm": 15.550000190734863, "learning_rate": 4.236437303864808e-05, "loss": 2.8815, "step": 738000 }, { "epoch": 1.5459105067488361, "grad_norm": 16.581403732299805, "learning_rate": 4.235912878215778e-05, "loss": 2.881, "step": 738500 }, { "epoch": 1.5469571624744616, "grad_norm": 17.43822479248047, "learning_rate": 4.235388452566749e-05, "loss": 2.9023, "step": 739000 }, { "epoch": 1.548003818200087, "grad_norm": 23.16053581237793, "learning_rate": 4.23486402691772e-05, "loss": 2.8699, "step": 739500 }, { "epoch": 1.5490504739257127, "grad_norm": 19.24244499206543, "learning_rate": 4.234339601268691e-05, "loss": 2.8853, "step": 740000 }, { "epoch": 1.5500971296513382, "grad_norm": 13.51376724243164, "learning_rate": 4.233815175619661e-05, "loss": 2.8759, "step": 740500 }, { "epoch": 1.5511437853769636, "grad_norm": 13.80211067199707, "learning_rate": 4.233290749970632e-05, "loss": 2.9068, "step": 741000 }, { "epoch": 1.552190441102589, "grad_norm": 12.631048202514648, "learning_rate": 4.2327663243216034e-05, "loss": 2.8955, "step": 741500 }, { "epoch": 1.5532370968282145, "grad_norm": 33.31592559814453, "learning_rate": 4.2322418986725745e-05, "loss": 2.9115, "step": 742000 }, { "epoch": 1.55428375255384, "grad_norm": 17.53080177307129, "learning_rate": 4.231717473023545e-05, "loss": 2.88, "step": 742500 }, { "epoch": 1.5553304082794654, "grad_norm": 13.151100158691406, "learning_rate": 4.2311930473745154e-05, "loss": 2.8926, "step": 743000 }, { "epoch": 1.556377064005091, "grad_norm": 46.055355072021484, "learning_rate": 4.2306686217254865e-05, "loss": 2.8704, "step": 743500 }, { "epoch": 1.5574237197307164, "grad_norm": 24.54062271118164, "learning_rate": 4.230144196076457e-05, "loss": 2.8842, "step": 744000 }, { "epoch": 1.5584703754563418, "grad_norm": 14.626265525817871, "learning_rate": 4.229619770427428e-05, "loss": 2.8903, "step": 744500 }, { "epoch": 1.5595170311819673, "grad_norm": 15.873600006103516, "learning_rate": 4.2290953447783985e-05, "loss": 2.8851, "step": 745000 }, { "epoch": 1.5605636869075927, "grad_norm": 14.838785171508789, "learning_rate": 4.2285709191293696e-05, "loss": 2.8926, "step": 745500 }, { "epoch": 1.5616103426332182, "grad_norm": 14.447505950927734, "learning_rate": 4.228046493480341e-05, "loss": 2.9083, "step": 746000 }, { "epoch": 1.5626569983588439, "grad_norm": 17.263473510742188, "learning_rate": 4.227522067831312e-05, "loss": 2.9091, "step": 746500 }, { "epoch": 1.5637036540844693, "grad_norm": 20.517009735107422, "learning_rate": 4.226997642182282e-05, "loss": 2.8767, "step": 747000 }, { "epoch": 1.5647503098100948, "grad_norm": 25.32591438293457, "learning_rate": 4.226473216533253e-05, "loss": 2.8782, "step": 747500 }, { "epoch": 1.5657969655357202, "grad_norm": 14.034526824951172, "learning_rate": 4.225948790884224e-05, "loss": 2.8769, "step": 748000 }, { "epoch": 1.566843621261346, "grad_norm": 15.001214027404785, "learning_rate": 4.225424365235194e-05, "loss": 2.8863, "step": 748500 }, { "epoch": 1.5678902769869714, "grad_norm": 15.80793571472168, "learning_rate": 4.224899939586165e-05, "loss": 2.8823, "step": 749000 }, { "epoch": 1.5689369327125968, "grad_norm": 12.958271026611328, "learning_rate": 4.224375513937136e-05, "loss": 2.892, "step": 749500 }, { "epoch": 1.5699835884382223, "grad_norm": 12.871553421020508, "learning_rate": 4.223851088288107e-05, "loss": 2.8841, "step": 750000 }, { "epoch": 1.5710302441638477, "grad_norm": 16.740089416503906, "learning_rate": 4.223326662639078e-05, "loss": 2.8828, "step": 750500 }, { "epoch": 1.5720768998894732, "grad_norm": 25.5275936126709, "learning_rate": 4.222802236990049e-05, "loss": 2.8696, "step": 751000 }, { "epoch": 1.5731235556150986, "grad_norm": 13.446409225463867, "learning_rate": 4.2222778113410195e-05, "loss": 2.8787, "step": 751500 }, { "epoch": 1.574170211340724, "grad_norm": 13.520666122436523, "learning_rate": 4.2217533856919906e-05, "loss": 2.8901, "step": 752000 }, { "epoch": 1.5752168670663496, "grad_norm": 12.441044807434082, "learning_rate": 4.221228960042961e-05, "loss": 2.9084, "step": 752500 }, { "epoch": 1.576263522791975, "grad_norm": 13.422410011291504, "learning_rate": 4.220704534393932e-05, "loss": 2.898, "step": 753000 }, { "epoch": 1.5773101785176005, "grad_norm": 13.134127616882324, "learning_rate": 4.2201801087449026e-05, "loss": 2.8778, "step": 753500 }, { "epoch": 1.578356834243226, "grad_norm": 14.41052532196045, "learning_rate": 4.219655683095874e-05, "loss": 2.9056, "step": 754000 }, { "epoch": 1.5794034899688514, "grad_norm": 12.30474853515625, "learning_rate": 4.219131257446845e-05, "loss": 2.8727, "step": 754500 }, { "epoch": 1.580450145694477, "grad_norm": 21.677478790283203, "learning_rate": 4.218606831797815e-05, "loss": 2.8758, "step": 755000 }, { "epoch": 1.5814968014201025, "grad_norm": 13.449736595153809, "learning_rate": 4.218082406148786e-05, "loss": 2.874, "step": 755500 }, { "epoch": 1.582543457145728, "grad_norm": 11.765925407409668, "learning_rate": 4.217557980499757e-05, "loss": 2.8801, "step": 756000 }, { "epoch": 1.5835901128713534, "grad_norm": 21.297134399414062, "learning_rate": 4.217033554850728e-05, "loss": 2.8855, "step": 756500 }, { "epoch": 1.5846367685969789, "grad_norm": 12.846230506896973, "learning_rate": 4.216509129201698e-05, "loss": 2.8865, "step": 757000 }, { "epoch": 1.5856834243226046, "grad_norm": 83.09071350097656, "learning_rate": 4.2159847035526694e-05, "loss": 2.879, "step": 757500 }, { "epoch": 1.58673008004823, "grad_norm": 14.1683988571167, "learning_rate": 4.21546027790364e-05, "loss": 2.884, "step": 758000 }, { "epoch": 1.5877767357738555, "grad_norm": 16.227764129638672, "learning_rate": 4.214935852254611e-05, "loss": 2.8892, "step": 758500 }, { "epoch": 1.588823391499481, "grad_norm": 13.156753540039062, "learning_rate": 4.214411426605582e-05, "loss": 2.8821, "step": 759000 }, { "epoch": 1.5898700472251064, "grad_norm": 14.169858932495117, "learning_rate": 4.2138870009565525e-05, "loss": 2.8575, "step": 759500 }, { "epoch": 1.5909167029507318, "grad_norm": 18.173168182373047, "learning_rate": 4.2133625753075236e-05, "loss": 2.8641, "step": 760000 }, { "epoch": 1.5919633586763573, "grad_norm": 15.177206993103027, "learning_rate": 4.212838149658494e-05, "loss": 2.8768, "step": 760500 }, { "epoch": 1.5930100144019828, "grad_norm": 13.034826278686523, "learning_rate": 4.212313724009465e-05, "loss": 2.8781, "step": 761000 }, { "epoch": 1.5940566701276082, "grad_norm": 12.146944046020508, "learning_rate": 4.2117892983604355e-05, "loss": 2.8557, "step": 761500 }, { "epoch": 1.5951033258532337, "grad_norm": 25.86343002319336, "learning_rate": 4.2112648727114067e-05, "loss": 2.8807, "step": 762000 }, { "epoch": 1.5961499815788591, "grad_norm": 15.404720306396484, "learning_rate": 4.210740447062377e-05, "loss": 2.8712, "step": 762500 }, { "epoch": 1.5971966373044846, "grad_norm": 16.64784049987793, "learning_rate": 4.210216021413348e-05, "loss": 2.8784, "step": 763000 }, { "epoch": 1.59824329303011, "grad_norm": 16.08076286315918, "learning_rate": 4.209691595764319e-05, "loss": 2.879, "step": 763500 }, { "epoch": 1.5992899487557357, "grad_norm": 44.233272552490234, "learning_rate": 4.2091671701152904e-05, "loss": 2.8643, "step": 764000 }, { "epoch": 1.6003366044813612, "grad_norm": 14.431419372558594, "learning_rate": 4.208642744466261e-05, "loss": 2.8638, "step": 764500 }, { "epoch": 1.6013832602069866, "grad_norm": 12.80484676361084, "learning_rate": 4.208118318817232e-05, "loss": 2.88, "step": 765000 }, { "epoch": 1.602429915932612, "grad_norm": 14.048552513122559, "learning_rate": 4.2075938931682024e-05, "loss": 2.87, "step": 765500 }, { "epoch": 1.6034765716582378, "grad_norm": 15.768491744995117, "learning_rate": 4.207069467519173e-05, "loss": 2.8691, "step": 766000 }, { "epoch": 1.6045232273838632, "grad_norm": 13.771637916564941, "learning_rate": 4.206545041870144e-05, "loss": 2.877, "step": 766500 }, { "epoch": 1.6055698831094887, "grad_norm": 13.218389511108398, "learning_rate": 4.2060206162211143e-05, "loss": 2.8773, "step": 767000 }, { "epoch": 1.6066165388351141, "grad_norm": 16.89997673034668, "learning_rate": 4.2054961905720855e-05, "loss": 2.8884, "step": 767500 }, { "epoch": 1.6076631945607396, "grad_norm": 14.062873840332031, "learning_rate": 4.2049717649230566e-05, "loss": 2.8749, "step": 768000 }, { "epoch": 1.608709850286365, "grad_norm": 11.76429271697998, "learning_rate": 4.204447339274028e-05, "loss": 2.8755, "step": 768500 }, { "epoch": 1.6097565060119905, "grad_norm": 19.128955841064453, "learning_rate": 4.203922913624998e-05, "loss": 2.8432, "step": 769000 }, { "epoch": 1.610803161737616, "grad_norm": 14.08618450164795, "learning_rate": 4.203398487975969e-05, "loss": 2.8891, "step": 769500 }, { "epoch": 1.6118498174632414, "grad_norm": 28.659317016601562, "learning_rate": 4.2028740623269396e-05, "loss": 2.8853, "step": 770000 }, { "epoch": 1.6128964731888669, "grad_norm": 14.165172576904297, "learning_rate": 4.202349636677911e-05, "loss": 2.8718, "step": 770500 }, { "epoch": 1.6139431289144923, "grad_norm": 32.53348159790039, "learning_rate": 4.201825211028881e-05, "loss": 2.8787, "step": 771000 }, { "epoch": 1.6149897846401178, "grad_norm": 11.978403091430664, "learning_rate": 4.2013007853798516e-05, "loss": 2.8429, "step": 771500 }, { "epoch": 1.6160364403657432, "grad_norm": 11.660234451293945, "learning_rate": 4.2007763597308234e-05, "loss": 2.8571, "step": 772000 }, { "epoch": 1.617083096091369, "grad_norm": 14.337581634521484, "learning_rate": 4.200251934081794e-05, "loss": 2.8748, "step": 772500 }, { "epoch": 1.6181297518169944, "grad_norm": 12.763467788696289, "learning_rate": 4.199727508432765e-05, "loss": 2.8621, "step": 773000 }, { "epoch": 1.6191764075426198, "grad_norm": 14.221198081970215, "learning_rate": 4.1992030827837354e-05, "loss": 2.8734, "step": 773500 }, { "epoch": 1.6202230632682453, "grad_norm": 17.9761905670166, "learning_rate": 4.1986786571347065e-05, "loss": 2.8921, "step": 774000 }, { "epoch": 1.6212697189938707, "grad_norm": 13.511699676513672, "learning_rate": 4.198154231485677e-05, "loss": 2.8523, "step": 774500 }, { "epoch": 1.6223163747194964, "grad_norm": 15.477879524230957, "learning_rate": 4.197629805836648e-05, "loss": 2.8853, "step": 775000 }, { "epoch": 1.6233630304451219, "grad_norm": 38.0817985534668, "learning_rate": 4.1971053801876184e-05, "loss": 2.8699, "step": 775500 }, { "epoch": 1.6244096861707473, "grad_norm": 14.9390869140625, "learning_rate": 4.1965809545385895e-05, "loss": 2.8616, "step": 776000 }, { "epoch": 1.6254563418963728, "grad_norm": 21.285751342773438, "learning_rate": 4.1960565288895607e-05, "loss": 2.8545, "step": 776500 }, { "epoch": 1.6265029976219982, "grad_norm": 13.06883430480957, "learning_rate": 4.195532103240531e-05, "loss": 2.864, "step": 777000 }, { "epoch": 1.6275496533476237, "grad_norm": 13.560805320739746, "learning_rate": 4.195007677591502e-05, "loss": 2.8553, "step": 777500 }, { "epoch": 1.6285963090732491, "grad_norm": 14.778905868530273, "learning_rate": 4.1944832519424726e-05, "loss": 2.88, "step": 778000 }, { "epoch": 1.6296429647988746, "grad_norm": 14.232298851013184, "learning_rate": 4.193958826293444e-05, "loss": 2.8807, "step": 778500 }, { "epoch": 1.6306896205245, "grad_norm": 12.102797508239746, "learning_rate": 4.193434400644414e-05, "loss": 2.8488, "step": 779000 }, { "epoch": 1.6317362762501255, "grad_norm": 14.43665599822998, "learning_rate": 4.192909974995385e-05, "loss": 2.8477, "step": 779500 }, { "epoch": 1.632782931975751, "grad_norm": 20.349796295166016, "learning_rate": 4.192385549346356e-05, "loss": 2.8537, "step": 780000 }, { "epoch": 1.6338295877013764, "grad_norm": 11.23513412475586, "learning_rate": 4.191861123697327e-05, "loss": 2.8711, "step": 780500 }, { "epoch": 1.6348762434270019, "grad_norm": 17.000864028930664, "learning_rate": 4.191336698048298e-05, "loss": 2.8442, "step": 781000 }, { "epoch": 1.6359228991526276, "grad_norm": 15.448736190795898, "learning_rate": 4.190812272399269e-05, "loss": 2.8516, "step": 781500 }, { "epoch": 1.636969554878253, "grad_norm": 12.179960250854492, "learning_rate": 4.1902878467502395e-05, "loss": 2.8774, "step": 782000 }, { "epoch": 1.6380162106038785, "grad_norm": 13.026206016540527, "learning_rate": 4.18976342110121e-05, "loss": 2.8472, "step": 782500 }, { "epoch": 1.639062866329504, "grad_norm": 13.19868278503418, "learning_rate": 4.189238995452181e-05, "loss": 2.8594, "step": 783000 }, { "epoch": 1.6401095220551296, "grad_norm": 13.099846839904785, "learning_rate": 4.1887145698031514e-05, "loss": 2.8702, "step": 783500 }, { "epoch": 1.641156177780755, "grad_norm": 12.000506401062012, "learning_rate": 4.1881901441541225e-05, "loss": 2.8894, "step": 784000 }, { "epoch": 1.6422028335063805, "grad_norm": 12.931654930114746, "learning_rate": 4.187665718505093e-05, "loss": 2.8506, "step": 784500 }, { "epoch": 1.643249489232006, "grad_norm": 12.487265586853027, "learning_rate": 4.187141292856064e-05, "loss": 2.8414, "step": 785000 }, { "epoch": 1.6442961449576314, "grad_norm": 12.416121482849121, "learning_rate": 4.186616867207035e-05, "loss": 2.8498, "step": 785500 }, { "epoch": 1.645342800683257, "grad_norm": 140.99575805664062, "learning_rate": 4.186092441558006e-05, "loss": 2.8547, "step": 786000 }, { "epoch": 1.6463894564088823, "grad_norm": 16.387081146240234, "learning_rate": 4.185568015908977e-05, "loss": 2.8548, "step": 786500 }, { "epoch": 1.6474361121345078, "grad_norm": 12.048304557800293, "learning_rate": 4.185043590259948e-05, "loss": 2.8533, "step": 787000 }, { "epoch": 1.6484827678601333, "grad_norm": 17.421005249023438, "learning_rate": 4.184519164610918e-05, "loss": 2.8574, "step": 787500 }, { "epoch": 1.6495294235857587, "grad_norm": 39.99574279785156, "learning_rate": 4.183994738961889e-05, "loss": 2.8679, "step": 788000 }, { "epoch": 1.6505760793113842, "grad_norm": 14.395581245422363, "learning_rate": 4.18347031331286e-05, "loss": 2.8658, "step": 788500 }, { "epoch": 1.6516227350370096, "grad_norm": 16.637266159057617, "learning_rate": 4.18294588766383e-05, "loss": 2.8322, "step": 789000 }, { "epoch": 1.652669390762635, "grad_norm": 12.836480140686035, "learning_rate": 4.182421462014802e-05, "loss": 2.8537, "step": 789500 }, { "epoch": 1.6537160464882608, "grad_norm": 13.014302253723145, "learning_rate": 4.1818970363657724e-05, "loss": 2.8703, "step": 790000 }, { "epoch": 1.6547627022138862, "grad_norm": 14.718812942504883, "learning_rate": 4.1813726107167435e-05, "loss": 2.8723, "step": 790500 }, { "epoch": 1.6558093579395117, "grad_norm": 12.305022239685059, "learning_rate": 4.180848185067714e-05, "loss": 2.8682, "step": 791000 }, { "epoch": 1.6568560136651371, "grad_norm": 18.22999382019043, "learning_rate": 4.180323759418685e-05, "loss": 2.8569, "step": 791500 }, { "epoch": 1.6579026693907626, "grad_norm": 22.253320693969727, "learning_rate": 4.1797993337696555e-05, "loss": 2.8533, "step": 792000 }, { "epoch": 1.6589493251163883, "grad_norm": 19.490814208984375, "learning_rate": 4.1792749081206266e-05, "loss": 2.8571, "step": 792500 }, { "epoch": 1.6599959808420137, "grad_norm": 14.795929908752441, "learning_rate": 4.178750482471597e-05, "loss": 2.8554, "step": 793000 }, { "epoch": 1.6610426365676392, "grad_norm": 13.852795600891113, "learning_rate": 4.178226056822568e-05, "loss": 2.8629, "step": 793500 }, { "epoch": 1.6620892922932646, "grad_norm": 19.342906951904297, "learning_rate": 4.177701631173539e-05, "loss": 2.8645, "step": 794000 }, { "epoch": 1.66313594801889, "grad_norm": 18.114755630493164, "learning_rate": 4.17717720552451e-05, "loss": 2.8639, "step": 794500 }, { "epoch": 1.6641826037445155, "grad_norm": 63.96286392211914, "learning_rate": 4.176652779875481e-05, "loss": 2.8558, "step": 795000 }, { "epoch": 1.665229259470141, "grad_norm": 25.9266300201416, "learning_rate": 4.176128354226451e-05, "loss": 2.8557, "step": 795500 }, { "epoch": 1.6662759151957665, "grad_norm": 13.5536470413208, "learning_rate": 4.1756039285774223e-05, "loss": 2.8587, "step": 796000 }, { "epoch": 1.667322570921392, "grad_norm": 12.51992416381836, "learning_rate": 4.175079502928393e-05, "loss": 2.8586, "step": 796500 }, { "epoch": 1.6683692266470174, "grad_norm": 13.29262638092041, "learning_rate": 4.174555077279364e-05, "loss": 2.8701, "step": 797000 }, { "epoch": 1.6694158823726428, "grad_norm": 19.870441436767578, "learning_rate": 4.174030651630334e-05, "loss": 2.8648, "step": 797500 }, { "epoch": 1.6704625380982683, "grad_norm": 12.337117195129395, "learning_rate": 4.1735062259813054e-05, "loss": 2.8555, "step": 798000 }, { "epoch": 1.6715091938238937, "grad_norm": 12.973998069763184, "learning_rate": 4.1729818003322765e-05, "loss": 2.8711, "step": 798500 }, { "epoch": 1.6725558495495194, "grad_norm": 21.46640968322754, "learning_rate": 4.1724573746832476e-05, "loss": 2.8581, "step": 799000 }, { "epoch": 1.6736025052751449, "grad_norm": 14.099344253540039, "learning_rate": 4.171932949034218e-05, "loss": 2.8568, "step": 799500 }, { "epoch": 1.6746491610007703, "grad_norm": 21.301366806030273, "learning_rate": 4.1714085233851885e-05, "loss": 2.8712, "step": 800000 }, { "epoch": 1.6756958167263958, "grad_norm": 12.643409729003906, "learning_rate": 4.1708840977361596e-05, "loss": 2.8426, "step": 800500 }, { "epoch": 1.6767424724520215, "grad_norm": 12.581509590148926, "learning_rate": 4.17035967208713e-05, "loss": 2.8526, "step": 801000 }, { "epoch": 1.677789128177647, "grad_norm": 29.59674644470215, "learning_rate": 4.169835246438101e-05, "loss": 2.8568, "step": 801500 }, { "epoch": 1.6788357839032724, "grad_norm": 14.645078659057617, "learning_rate": 4.1693108207890716e-05, "loss": 2.869, "step": 802000 }, { "epoch": 1.6798824396288978, "grad_norm": 12.204601287841797, "learning_rate": 4.168786395140043e-05, "loss": 2.862, "step": 802500 }, { "epoch": 1.6809290953545233, "grad_norm": 13.087714195251465, "learning_rate": 4.168261969491014e-05, "loss": 2.8532, "step": 803000 }, { "epoch": 1.6819757510801487, "grad_norm": 13.106924057006836, "learning_rate": 4.167737543841985e-05, "loss": 2.863, "step": 803500 }, { "epoch": 1.6830224068057742, "grad_norm": 12.537446022033691, "learning_rate": 4.167213118192955e-05, "loss": 2.8573, "step": 804000 }, { "epoch": 1.6840690625313997, "grad_norm": 12.808329582214355, "learning_rate": 4.1666886925439264e-05, "loss": 2.8593, "step": 804500 }, { "epoch": 1.685115718257025, "grad_norm": 10.882098197937012, "learning_rate": 4.166164266894897e-05, "loss": 2.8516, "step": 805000 }, { "epoch": 1.6861623739826506, "grad_norm": 14.248497009277344, "learning_rate": 4.165639841245867e-05, "loss": 2.8339, "step": 805500 }, { "epoch": 1.687209029708276, "grad_norm": 11.311664581298828, "learning_rate": 4.1651154155968384e-05, "loss": 2.8472, "step": 806000 }, { "epoch": 1.6882556854339015, "grad_norm": 12.364740371704102, "learning_rate": 4.164590989947809e-05, "loss": 2.8509, "step": 806500 }, { "epoch": 1.689302341159527, "grad_norm": 14.56032657623291, "learning_rate": 4.16406656429878e-05, "loss": 2.8479, "step": 807000 }, { "epoch": 1.6903489968851526, "grad_norm": 11.649042129516602, "learning_rate": 4.163542138649751e-05, "loss": 2.8473, "step": 807500 }, { "epoch": 1.691395652610778, "grad_norm": 12.935726165771484, "learning_rate": 4.163017713000722e-05, "loss": 2.8402, "step": 808000 }, { "epoch": 1.6924423083364035, "grad_norm": 14.551690101623535, "learning_rate": 4.1624932873516926e-05, "loss": 2.8506, "step": 808500 }, { "epoch": 1.693488964062029, "grad_norm": 14.858942031860352, "learning_rate": 4.161968861702664e-05, "loss": 2.8338, "step": 809000 }, { "epoch": 1.6945356197876544, "grad_norm": 13.095915794372559, "learning_rate": 4.161444436053634e-05, "loss": 2.8452, "step": 809500 }, { "epoch": 1.6955822755132801, "grad_norm": 13.186365127563477, "learning_rate": 4.160920010404605e-05, "loss": 2.8478, "step": 810000 }, { "epoch": 1.6966289312389056, "grad_norm": 17.33461570739746, "learning_rate": 4.160395584755576e-05, "loss": 2.8504, "step": 810500 }, { "epoch": 1.697675586964531, "grad_norm": 14.63687801361084, "learning_rate": 4.159871159106547e-05, "loss": 2.8592, "step": 811000 }, { "epoch": 1.6987222426901565, "grad_norm": 29.964635848999023, "learning_rate": 4.159346733457518e-05, "loss": 2.8452, "step": 811500 }, { "epoch": 1.699768898415782, "grad_norm": 16.69788932800293, "learning_rate": 4.158822307808488e-05, "loss": 2.8494, "step": 812000 }, { "epoch": 1.7008155541414074, "grad_norm": 10.293091773986816, "learning_rate": 4.1582978821594594e-05, "loss": 2.8437, "step": 812500 }, { "epoch": 1.7018622098670328, "grad_norm": 12.832427024841309, "learning_rate": 4.15777345651043e-05, "loss": 2.8602, "step": 813000 }, { "epoch": 1.7029088655926583, "grad_norm": 10.74872875213623, "learning_rate": 4.157249030861401e-05, "loss": 2.8582, "step": 813500 }, { "epoch": 1.7039555213182838, "grad_norm": 14.008438110351562, "learning_rate": 4.1567246052123714e-05, "loss": 2.8443, "step": 814000 }, { "epoch": 1.7050021770439092, "grad_norm": 14.210382461547852, "learning_rate": 4.1562001795633425e-05, "loss": 2.8253, "step": 814500 }, { "epoch": 1.7060488327695347, "grad_norm": 24.134014129638672, "learning_rate": 4.155675753914313e-05, "loss": 2.8545, "step": 815000 }, { "epoch": 1.7070954884951601, "grad_norm": 15.446382522583008, "learning_rate": 4.155151328265284e-05, "loss": 2.8534, "step": 815500 }, { "epoch": 1.7081421442207856, "grad_norm": 13.252299308776855, "learning_rate": 4.154626902616255e-05, "loss": 2.8358, "step": 816000 }, { "epoch": 1.7091887999464113, "grad_norm": 14.50414752960205, "learning_rate": 4.1541024769672256e-05, "loss": 2.8257, "step": 816500 }, { "epoch": 1.7102354556720367, "grad_norm": 12.001198768615723, "learning_rate": 4.153578051318197e-05, "loss": 2.8406, "step": 817000 }, { "epoch": 1.7112821113976622, "grad_norm": 13.497008323669434, "learning_rate": 4.153053625669167e-05, "loss": 2.8429, "step": 817500 }, { "epoch": 1.7123287671232876, "grad_norm": 14.006306648254395, "learning_rate": 4.152529200020138e-05, "loss": 2.8355, "step": 818000 }, { "epoch": 1.7133754228489133, "grad_norm": 14.444618225097656, "learning_rate": 4.1520047743711087e-05, "loss": 2.8458, "step": 818500 }, { "epoch": 1.7144220785745388, "grad_norm": 14.769205093383789, "learning_rate": 4.15148034872208e-05, "loss": 2.8456, "step": 819000 }, { "epoch": 1.7154687343001642, "grad_norm": 14.668609619140625, "learning_rate": 4.15095592307305e-05, "loss": 2.8252, "step": 819500 }, { "epoch": 1.7165153900257897, "grad_norm": 13.621663093566895, "learning_rate": 4.150431497424021e-05, "loss": 2.8372, "step": 820000 }, { "epoch": 1.7175620457514151, "grad_norm": 54.42759323120117, "learning_rate": 4.1499070717749924e-05, "loss": 2.8509, "step": 820500 }, { "epoch": 1.7186087014770406, "grad_norm": 13.287393569946289, "learning_rate": 4.1493826461259635e-05, "loss": 2.813, "step": 821000 }, { "epoch": 1.719655357202666, "grad_norm": 18.610671997070312, "learning_rate": 4.148858220476934e-05, "loss": 2.8179, "step": 821500 }, { "epoch": 1.7207020129282915, "grad_norm": 15.503461837768555, "learning_rate": 4.1483337948279044e-05, "loss": 2.8311, "step": 822000 }, { "epoch": 1.721748668653917, "grad_norm": 13.059161186218262, "learning_rate": 4.1478093691788755e-05, "loss": 2.8463, "step": 822500 }, { "epoch": 1.7227953243795424, "grad_norm": 11.715764045715332, "learning_rate": 4.147284943529846e-05, "loss": 2.8451, "step": 823000 }, { "epoch": 1.7238419801051679, "grad_norm": 14.462679862976074, "learning_rate": 4.146760517880817e-05, "loss": 2.8644, "step": 823500 }, { "epoch": 1.7248886358307933, "grad_norm": 15.612573623657227, "learning_rate": 4.1462360922317874e-05, "loss": 2.8177, "step": 824000 }, { "epoch": 1.7259352915564188, "grad_norm": 17.736482620239258, "learning_rate": 4.1457116665827586e-05, "loss": 2.831, "step": 824500 }, { "epoch": 1.7269819472820445, "grad_norm": 12.891092300415039, "learning_rate": 4.14518724093373e-05, "loss": 2.8436, "step": 825000 }, { "epoch": 1.72802860300767, "grad_norm": 20.992923736572266, "learning_rate": 4.144662815284701e-05, "loss": 2.8442, "step": 825500 }, { "epoch": 1.7290752587332954, "grad_norm": 13.188494682312012, "learning_rate": 4.144138389635671e-05, "loss": 2.8196, "step": 826000 }, { "epoch": 1.7301219144589208, "grad_norm": 63.450218200683594, "learning_rate": 4.143613963986642e-05, "loss": 2.8416, "step": 826500 }, { "epoch": 1.7311685701845463, "grad_norm": 12.60846996307373, "learning_rate": 4.143089538337613e-05, "loss": 2.8241, "step": 827000 }, { "epoch": 1.732215225910172, "grad_norm": 14.666458129882812, "learning_rate": 4.142565112688584e-05, "loss": 2.823, "step": 827500 }, { "epoch": 1.7332618816357974, "grad_norm": 41.701507568359375, "learning_rate": 4.142040687039554e-05, "loss": 2.8405, "step": 828000 }, { "epoch": 1.7343085373614229, "grad_norm": 15.282578468322754, "learning_rate": 4.1415162613905254e-05, "loss": 2.8382, "step": 828500 }, { "epoch": 1.7353551930870483, "grad_norm": 13.601941108703613, "learning_rate": 4.1409918357414965e-05, "loss": 2.8366, "step": 829000 }, { "epoch": 1.7364018488126738, "grad_norm": 14.702469825744629, "learning_rate": 4.140467410092467e-05, "loss": 2.8397, "step": 829500 }, { "epoch": 1.7374485045382992, "grad_norm": 13.547781944274902, "learning_rate": 4.139942984443438e-05, "loss": 2.8361, "step": 830000 }, { "epoch": 1.7384951602639247, "grad_norm": 14.676349639892578, "learning_rate": 4.1394185587944085e-05, "loss": 2.8325, "step": 830500 }, { "epoch": 1.7395418159895502, "grad_norm": 13.517903327941895, "learning_rate": 4.1388941331453796e-05, "loss": 2.8337, "step": 831000 }, { "epoch": 1.7405884717151756, "grad_norm": 13.004969596862793, "learning_rate": 4.13836970749635e-05, "loss": 2.8393, "step": 831500 }, { "epoch": 1.741635127440801, "grad_norm": 17.326745986938477, "learning_rate": 4.137845281847321e-05, "loss": 2.8261, "step": 832000 }, { "epoch": 1.7426817831664265, "grad_norm": 17.82477378845215, "learning_rate": 4.1373208561982915e-05, "loss": 2.8371, "step": 832500 }, { "epoch": 1.743728438892052, "grad_norm": 17.451580047607422, "learning_rate": 4.1367964305492626e-05, "loss": 2.8288, "step": 833000 }, { "epoch": 1.7447750946176774, "grad_norm": 14.204693794250488, "learning_rate": 4.136272004900234e-05, "loss": 2.8412, "step": 833500 }, { "epoch": 1.7458217503433031, "grad_norm": 15.106487274169922, "learning_rate": 4.135747579251204e-05, "loss": 2.8463, "step": 834000 }, { "epoch": 1.7468684060689286, "grad_norm": 17.649063110351562, "learning_rate": 4.135223153602175e-05, "loss": 2.8634, "step": 834500 }, { "epoch": 1.747915061794554, "grad_norm": 11.777483940124512, "learning_rate": 4.134698727953146e-05, "loss": 2.8291, "step": 835000 }, { "epoch": 1.7489617175201795, "grad_norm": 14.59937858581543, "learning_rate": 4.134174302304117e-05, "loss": 2.8343, "step": 835500 }, { "epoch": 1.7500083732458052, "grad_norm": 12.527703285217285, "learning_rate": 4.133649876655087e-05, "loss": 2.8483, "step": 836000 }, { "epoch": 1.7510550289714306, "grad_norm": 12.382955551147461, "learning_rate": 4.1331254510060584e-05, "loss": 2.8331, "step": 836500 }, { "epoch": 1.752101684697056, "grad_norm": 13.258570671081543, "learning_rate": 4.132601025357029e-05, "loss": 2.8413, "step": 837000 }, { "epoch": 1.7531483404226815, "grad_norm": 11.720141410827637, "learning_rate": 4.132076599708e-05, "loss": 2.8245, "step": 837500 }, { "epoch": 1.754194996148307, "grad_norm": 14.749502182006836, "learning_rate": 4.131552174058971e-05, "loss": 2.842, "step": 838000 }, { "epoch": 1.7552416518739324, "grad_norm": 51.23678207397461, "learning_rate": 4.131027748409942e-05, "loss": 2.829, "step": 838500 }, { "epoch": 1.756288307599558, "grad_norm": 18.92572784423828, "learning_rate": 4.1305033227609126e-05, "loss": 2.837, "step": 839000 }, { "epoch": 1.7573349633251834, "grad_norm": 13.482343673706055, "learning_rate": 4.129978897111883e-05, "loss": 2.827, "step": 839500 }, { "epoch": 1.7583816190508088, "grad_norm": 13.172675132751465, "learning_rate": 4.129454471462854e-05, "loss": 2.8426, "step": 840000 }, { "epoch": 1.7594282747764343, "grad_norm": 11.793724060058594, "learning_rate": 4.1289300458138245e-05, "loss": 2.8586, "step": 840500 }, { "epoch": 1.7604749305020597, "grad_norm": 14.616333961486816, "learning_rate": 4.1284056201647956e-05, "loss": 2.8272, "step": 841000 }, { "epoch": 1.7615215862276852, "grad_norm": 14.008602142333984, "learning_rate": 4.127881194515766e-05, "loss": 2.8185, "step": 841500 }, { "epoch": 1.7625682419533106, "grad_norm": 14.624236106872559, "learning_rate": 4.127356768866737e-05, "loss": 2.8302, "step": 842000 }, { "epoch": 1.7636148976789363, "grad_norm": 15.337923049926758, "learning_rate": 4.126832343217708e-05, "loss": 2.8498, "step": 842500 }, { "epoch": 1.7646615534045618, "grad_norm": 11.827573776245117, "learning_rate": 4.1263079175686794e-05, "loss": 2.8298, "step": 843000 }, { "epoch": 1.7657082091301872, "grad_norm": 13.03787612915039, "learning_rate": 4.12578349191965e-05, "loss": 2.8233, "step": 843500 }, { "epoch": 1.7667548648558127, "grad_norm": 12.885627746582031, "learning_rate": 4.125259066270621e-05, "loss": 2.8619, "step": 844000 }, { "epoch": 1.7678015205814381, "grad_norm": 11.505794525146484, "learning_rate": 4.1247346406215914e-05, "loss": 2.8063, "step": 844500 }, { "epoch": 1.7688481763070638, "grad_norm": 16.21778678894043, "learning_rate": 4.124210214972562e-05, "loss": 2.8155, "step": 845000 }, { "epoch": 1.7698948320326893, "grad_norm": 12.055370330810547, "learning_rate": 4.123685789323533e-05, "loss": 2.8246, "step": 845500 }, { "epoch": 1.7709414877583147, "grad_norm": 14.430549621582031, "learning_rate": 4.123161363674504e-05, "loss": 2.848, "step": 846000 }, { "epoch": 1.7719881434839402, "grad_norm": 14.95145034790039, "learning_rate": 4.122636938025475e-05, "loss": 2.8382, "step": 846500 }, { "epoch": 1.7730347992095656, "grad_norm": 13.243428230285645, "learning_rate": 4.1221125123764455e-05, "loss": 2.8304, "step": 847000 }, { "epoch": 1.774081454935191, "grad_norm": 11.367372512817383, "learning_rate": 4.1215880867274166e-05, "loss": 2.8419, "step": 847500 }, { "epoch": 1.7751281106608165, "grad_norm": 12.80209732055664, "learning_rate": 4.121063661078387e-05, "loss": 2.8256, "step": 848000 }, { "epoch": 1.776174766386442, "grad_norm": 13.512955665588379, "learning_rate": 4.120539235429358e-05, "loss": 2.8153, "step": 848500 }, { "epoch": 1.7772214221120675, "grad_norm": 12.340147018432617, "learning_rate": 4.1200148097803286e-05, "loss": 2.8269, "step": 849000 }, { "epoch": 1.778268077837693, "grad_norm": 13.027588844299316, "learning_rate": 4.1194903841313e-05, "loss": 2.8159, "step": 849500 }, { "epoch": 1.7793147335633184, "grad_norm": 16.180002212524414, "learning_rate": 4.11896595848227e-05, "loss": 2.8199, "step": 850000 }, { "epoch": 1.7803613892889438, "grad_norm": 13.589118003845215, "learning_rate": 4.118441532833241e-05, "loss": 2.8342, "step": 850500 }, { "epoch": 1.7814080450145693, "grad_norm": 13.584065437316895, "learning_rate": 4.1179171071842124e-05, "loss": 2.8464, "step": 851000 }, { "epoch": 1.782454700740195, "grad_norm": 12.01356029510498, "learning_rate": 4.117392681535183e-05, "loss": 2.8217, "step": 851500 }, { "epoch": 1.7835013564658204, "grad_norm": 11.410931587219238, "learning_rate": 4.116868255886154e-05, "loss": 2.8334, "step": 852000 }, { "epoch": 1.7845480121914459, "grad_norm": 12.02900505065918, "learning_rate": 4.1163438302371243e-05, "loss": 2.8178, "step": 852500 }, { "epoch": 1.7855946679170713, "grad_norm": 14.817281723022461, "learning_rate": 4.1158194045880954e-05, "loss": 2.8232, "step": 853000 }, { "epoch": 1.786641323642697, "grad_norm": 12.961188316345215, "learning_rate": 4.115294978939066e-05, "loss": 2.8278, "step": 853500 }, { "epoch": 1.7876879793683225, "grad_norm": 14.577455520629883, "learning_rate": 4.114770553290037e-05, "loss": 2.834, "step": 854000 }, { "epoch": 1.788734635093948, "grad_norm": 13.457056045532227, "learning_rate": 4.1142461276410074e-05, "loss": 2.8184, "step": 854500 }, { "epoch": 1.7897812908195734, "grad_norm": 15.64820384979248, "learning_rate": 4.1137217019919785e-05, "loss": 2.8156, "step": 855000 }, { "epoch": 1.7908279465451988, "grad_norm": 11.645292282104492, "learning_rate": 4.1131972763429496e-05, "loss": 2.8146, "step": 855500 }, { "epoch": 1.7918746022708243, "grad_norm": 16.474544525146484, "learning_rate": 4.11267285069392e-05, "loss": 2.8363, "step": 856000 }, { "epoch": 1.7929212579964497, "grad_norm": 13.087955474853516, "learning_rate": 4.112148425044891e-05, "loss": 2.7952, "step": 856500 }, { "epoch": 1.7939679137220752, "grad_norm": 13.860799789428711, "learning_rate": 4.1116239993958616e-05, "loss": 2.8412, "step": 857000 }, { "epoch": 1.7950145694477007, "grad_norm": 15.849038124084473, "learning_rate": 4.111099573746833e-05, "loss": 2.8273, "step": 857500 }, { "epoch": 1.7960612251733261, "grad_norm": 12.5424165725708, "learning_rate": 4.110575148097803e-05, "loss": 2.819, "step": 858000 }, { "epoch": 1.7971078808989516, "grad_norm": 13.60267448425293, "learning_rate": 4.110050722448774e-05, "loss": 2.797, "step": 858500 }, { "epoch": 1.798154536624577, "grad_norm": 14.720662117004395, "learning_rate": 4.109526296799745e-05, "loss": 2.8199, "step": 859000 }, { "epoch": 1.7992011923502025, "grad_norm": 15.256004333496094, "learning_rate": 4.109001871150716e-05, "loss": 2.8276, "step": 859500 }, { "epoch": 1.8002478480758282, "grad_norm": 20.957286834716797, "learning_rate": 4.108477445501687e-05, "loss": 2.8358, "step": 860000 }, { "epoch": 1.8012945038014536, "grad_norm": 12.8483304977417, "learning_rate": 4.107953019852658e-05, "loss": 2.8205, "step": 860500 }, { "epoch": 1.802341159527079, "grad_norm": 13.47350025177002, "learning_rate": 4.1074285942036284e-05, "loss": 2.81, "step": 861000 }, { "epoch": 1.8033878152527045, "grad_norm": 11.451008796691895, "learning_rate": 4.1069041685545995e-05, "loss": 2.8152, "step": 861500 }, { "epoch": 1.80443447097833, "grad_norm": 18.77720069885254, "learning_rate": 4.10637974290557e-05, "loss": 2.8176, "step": 862000 }, { "epoch": 1.8054811267039557, "grad_norm": 12.54980182647705, "learning_rate": 4.1058553172565404e-05, "loss": 2.8307, "step": 862500 }, { "epoch": 1.8065277824295811, "grad_norm": 13.897339820861816, "learning_rate": 4.1053308916075115e-05, "loss": 2.8071, "step": 863000 }, { "epoch": 1.8075744381552066, "grad_norm": 12.974977493286133, "learning_rate": 4.1048064659584826e-05, "loss": 2.8144, "step": 863500 }, { "epoch": 1.808621093880832, "grad_norm": 14.40284538269043, "learning_rate": 4.104282040309454e-05, "loss": 2.8273, "step": 864000 }, { "epoch": 1.8096677496064575, "grad_norm": 12.95310115814209, "learning_rate": 4.103757614660424e-05, "loss": 2.8302, "step": 864500 }, { "epoch": 1.810714405332083, "grad_norm": 15.471813201904297, "learning_rate": 4.103233189011395e-05, "loss": 2.8245, "step": 865000 }, { "epoch": 1.8117610610577084, "grad_norm": 13.780024528503418, "learning_rate": 4.102708763362366e-05, "loss": 2.8226, "step": 865500 }, { "epoch": 1.8128077167833339, "grad_norm": 12.580544471740723, "learning_rate": 4.102184337713337e-05, "loss": 2.8193, "step": 866000 }, { "epoch": 1.8138543725089593, "grad_norm": 20.804067611694336, "learning_rate": 4.101659912064307e-05, "loss": 2.8202, "step": 866500 }, { "epoch": 1.8149010282345848, "grad_norm": 25.34147834777832, "learning_rate": 4.101135486415278e-05, "loss": 2.8112, "step": 867000 }, { "epoch": 1.8159476839602102, "grad_norm": 26.059757232666016, "learning_rate": 4.100611060766249e-05, "loss": 2.8137, "step": 867500 }, { "epoch": 1.8169943396858357, "grad_norm": 18.03243637084961, "learning_rate": 4.10008663511722e-05, "loss": 2.8098, "step": 868000 }, { "epoch": 1.8180409954114611, "grad_norm": 14.003212928771973, "learning_rate": 4.099562209468191e-05, "loss": 2.8215, "step": 868500 }, { "epoch": 1.8190876511370868, "grad_norm": 15.684614181518555, "learning_rate": 4.0990377838191614e-05, "loss": 2.8278, "step": 869000 }, { "epoch": 1.8201343068627123, "grad_norm": 14.483017921447754, "learning_rate": 4.0985133581701325e-05, "loss": 2.8371, "step": 869500 }, { "epoch": 1.8211809625883377, "grad_norm": 25.315929412841797, "learning_rate": 4.097988932521103e-05, "loss": 2.8273, "step": 870000 }, { "epoch": 1.8222276183139632, "grad_norm": 12.279741287231445, "learning_rate": 4.097464506872074e-05, "loss": 2.8298, "step": 870500 }, { "epoch": 1.8232742740395889, "grad_norm": 17.805063247680664, "learning_rate": 4.0969400812230445e-05, "loss": 2.8279, "step": 871000 }, { "epoch": 1.8243209297652143, "grad_norm": 12.098453521728516, "learning_rate": 4.0964156555740156e-05, "loss": 2.822, "step": 871500 }, { "epoch": 1.8253675854908398, "grad_norm": 15.295697212219238, "learning_rate": 4.095891229924986e-05, "loss": 2.8209, "step": 872000 }, { "epoch": 1.8264142412164652, "grad_norm": 13.31796932220459, "learning_rate": 4.095366804275957e-05, "loss": 2.825, "step": 872500 }, { "epoch": 1.8274608969420907, "grad_norm": 12.277231216430664, "learning_rate": 4.094842378626928e-05, "loss": 2.8057, "step": 873000 }, { "epoch": 1.8285075526677161, "grad_norm": 12.375404357910156, "learning_rate": 4.094317952977899e-05, "loss": 2.8049, "step": 873500 }, { "epoch": 1.8295542083933416, "grad_norm": 13.330805778503418, "learning_rate": 4.09379352732887e-05, "loss": 2.8194, "step": 874000 }, { "epoch": 1.830600864118967, "grad_norm": 11.163919448852539, "learning_rate": 4.09326910167984e-05, "loss": 2.8136, "step": 874500 }, { "epoch": 1.8316475198445925, "grad_norm": 12.225991249084473, "learning_rate": 4.092744676030811e-05, "loss": 2.8016, "step": 875000 }, { "epoch": 1.832694175570218, "grad_norm": 14.438183784484863, "learning_rate": 4.092220250381782e-05, "loss": 2.7936, "step": 875500 }, { "epoch": 1.8337408312958434, "grad_norm": 14.451757431030273, "learning_rate": 4.091695824732753e-05, "loss": 2.8098, "step": 876000 }, { "epoch": 1.8347874870214689, "grad_norm": 17.09204864501953, "learning_rate": 4.091171399083723e-05, "loss": 2.7986, "step": 876500 }, { "epoch": 1.8358341427470943, "grad_norm": 14.078213691711426, "learning_rate": 4.0906469734346944e-05, "loss": 2.8086, "step": 877000 }, { "epoch": 1.83688079847272, "grad_norm": 11.47950267791748, "learning_rate": 4.0901225477856655e-05, "loss": 2.8342, "step": 877500 }, { "epoch": 1.8379274541983455, "grad_norm": 19.311731338500977, "learning_rate": 4.0895981221366366e-05, "loss": 2.83, "step": 878000 }, { "epoch": 1.838974109923971, "grad_norm": 18.090255737304688, "learning_rate": 4.089073696487607e-05, "loss": 2.8292, "step": 878500 }, { "epoch": 1.8400207656495964, "grad_norm": 16.497549057006836, "learning_rate": 4.0885492708385775e-05, "loss": 2.8257, "step": 879000 }, { "epoch": 1.841067421375222, "grad_norm": 17.800107955932617, "learning_rate": 4.0880248451895486e-05, "loss": 2.823, "step": 879500 }, { "epoch": 1.8421140771008475, "grad_norm": 12.265530586242676, "learning_rate": 4.087500419540519e-05, "loss": 2.8256, "step": 880000 }, { "epoch": 1.843160732826473, "grad_norm": 14.550470352172852, "learning_rate": 4.08697599389149e-05, "loss": 2.8293, "step": 880500 }, { "epoch": 1.8442073885520984, "grad_norm": 13.409524917602539, "learning_rate": 4.086451568242461e-05, "loss": 2.8117, "step": 881000 }, { "epoch": 1.8452540442777239, "grad_norm": 22.57919692993164, "learning_rate": 4.085927142593432e-05, "loss": 2.8125, "step": 881500 }, { "epoch": 1.8463007000033493, "grad_norm": 19.29476547241211, "learning_rate": 4.085402716944403e-05, "loss": 2.8186, "step": 882000 }, { "epoch": 1.8473473557289748, "grad_norm": 12.402812957763672, "learning_rate": 4.084878291295374e-05, "loss": 2.8168, "step": 882500 }, { "epoch": 1.8483940114546002, "grad_norm": 16.018352508544922, "learning_rate": 4.084353865646344e-05, "loss": 2.8045, "step": 883000 }, { "epoch": 1.8494406671802257, "grad_norm": 14.830755233764648, "learning_rate": 4.0838294399973154e-05, "loss": 2.8202, "step": 883500 }, { "epoch": 1.8504873229058512, "grad_norm": 12.382226943969727, "learning_rate": 4.083305014348286e-05, "loss": 2.8094, "step": 884000 }, { "epoch": 1.8515339786314766, "grad_norm": 12.663626670837402, "learning_rate": 4.082780588699256e-05, "loss": 2.8213, "step": 884500 }, { "epoch": 1.852580634357102, "grad_norm": 13.234729766845703, "learning_rate": 4.0822561630502274e-05, "loss": 2.8157, "step": 885000 }, { "epoch": 1.8536272900827275, "grad_norm": 17.377836227416992, "learning_rate": 4.0817317374011985e-05, "loss": 2.8323, "step": 885500 }, { "epoch": 1.854673945808353, "grad_norm": 14.495771408081055, "learning_rate": 4.0812073117521696e-05, "loss": 2.8063, "step": 886000 }, { "epoch": 1.8557206015339787, "grad_norm": 19.80487632751465, "learning_rate": 4.08068288610314e-05, "loss": 2.8004, "step": 886500 }, { "epoch": 1.8567672572596041, "grad_norm": 13.077528953552246, "learning_rate": 4.080158460454111e-05, "loss": 2.8149, "step": 887000 }, { "epoch": 1.8578139129852296, "grad_norm": 12.224275588989258, "learning_rate": 4.0796340348050816e-05, "loss": 2.8282, "step": 887500 }, { "epoch": 1.858860568710855, "grad_norm": 14.188151359558105, "learning_rate": 4.079109609156053e-05, "loss": 2.8251, "step": 888000 }, { "epoch": 1.8599072244364807, "grad_norm": 14.021940231323242, "learning_rate": 4.078585183507023e-05, "loss": 2.8123, "step": 888500 }, { "epoch": 1.8609538801621062, "grad_norm": 15.106484413146973, "learning_rate": 4.078060757857994e-05, "loss": 2.804, "step": 889000 }, { "epoch": 1.8620005358877316, "grad_norm": 13.178585052490234, "learning_rate": 4.0775363322089646e-05, "loss": 2.7993, "step": 889500 }, { "epoch": 1.863047191613357, "grad_norm": 17.768321990966797, "learning_rate": 4.077011906559936e-05, "loss": 2.8147, "step": 890000 }, { "epoch": 1.8640938473389825, "grad_norm": 17.242843627929688, "learning_rate": 4.076487480910907e-05, "loss": 2.83, "step": 890500 }, { "epoch": 1.865140503064608, "grad_norm": 14.699751853942871, "learning_rate": 4.075963055261877e-05, "loss": 2.803, "step": 891000 }, { "epoch": 1.8661871587902334, "grad_norm": 14.532470703125, "learning_rate": 4.0754386296128484e-05, "loss": 2.8331, "step": 891500 }, { "epoch": 1.867233814515859, "grad_norm": 14.192405700683594, "learning_rate": 4.074914203963819e-05, "loss": 2.8062, "step": 892000 }, { "epoch": 1.8682804702414844, "grad_norm": 18.571165084838867, "learning_rate": 4.07438977831479e-05, "loss": 2.8045, "step": 892500 }, { "epoch": 1.8693271259671098, "grad_norm": 13.1187105178833, "learning_rate": 4.0738653526657604e-05, "loss": 2.8049, "step": 893000 }, { "epoch": 1.8703737816927353, "grad_norm": 14.833242416381836, "learning_rate": 4.0733409270167315e-05, "loss": 2.8268, "step": 893500 }, { "epoch": 1.8714204374183607, "grad_norm": 17.261669158935547, "learning_rate": 4.072816501367702e-05, "loss": 2.8136, "step": 894000 }, { "epoch": 1.8724670931439862, "grad_norm": 15.158129692077637, "learning_rate": 4.072292075718673e-05, "loss": 2.8307, "step": 894500 }, { "epoch": 1.8735137488696119, "grad_norm": 21.870515823364258, "learning_rate": 4.071767650069644e-05, "loss": 2.8371, "step": 895000 }, { "epoch": 1.8745604045952373, "grad_norm": 18.185705184936523, "learning_rate": 4.071243224420615e-05, "loss": 2.8276, "step": 895500 }, { "epoch": 1.8756070603208628, "grad_norm": 12.639139175415039, "learning_rate": 4.0707187987715857e-05, "loss": 2.8109, "step": 896000 }, { "epoch": 1.8766537160464882, "grad_norm": 13.616013526916504, "learning_rate": 4.070194373122556e-05, "loss": 2.8102, "step": 896500 }, { "epoch": 1.877700371772114, "grad_norm": 14.052802085876465, "learning_rate": 4.069669947473527e-05, "loss": 2.8044, "step": 897000 }, { "epoch": 1.8787470274977394, "grad_norm": 15.063672065734863, "learning_rate": 4.0691455218244976e-05, "loss": 2.8087, "step": 897500 }, { "epoch": 1.8797936832233648, "grad_norm": 12.720634460449219, "learning_rate": 4.068621096175469e-05, "loss": 2.8195, "step": 898000 }, { "epoch": 1.8808403389489903, "grad_norm": 13.750849723815918, "learning_rate": 4.068096670526439e-05, "loss": 2.8072, "step": 898500 }, { "epoch": 1.8818869946746157, "grad_norm": 15.427648544311523, "learning_rate": 4.067572244877411e-05, "loss": 2.8208, "step": 899000 }, { "epoch": 1.8829336504002412, "grad_norm": 16.989784240722656, "learning_rate": 4.0670478192283814e-05, "loss": 2.7988, "step": 899500 }, { "epoch": 1.8839803061258666, "grad_norm": 18.24370765686035, "learning_rate": 4.0665233935793525e-05, "loss": 2.8296, "step": 900000 }, { "epoch": 1.885026961851492, "grad_norm": 14.449324607849121, "learning_rate": 4.065998967930323e-05, "loss": 2.8103, "step": 900500 }, { "epoch": 1.8860736175771176, "grad_norm": 12.794986724853516, "learning_rate": 4.065474542281294e-05, "loss": 2.8194, "step": 901000 }, { "epoch": 1.887120273302743, "grad_norm": 13.589138984680176, "learning_rate": 4.0649501166322645e-05, "loss": 2.8297, "step": 901500 }, { "epoch": 1.8881669290283685, "grad_norm": 12.782379150390625, "learning_rate": 4.064425690983235e-05, "loss": 2.8161, "step": 902000 }, { "epoch": 1.889213584753994, "grad_norm": 12.527366638183594, "learning_rate": 4.063901265334206e-05, "loss": 2.8118, "step": 902500 }, { "epoch": 1.8902602404796194, "grad_norm": 16.45027732849121, "learning_rate": 4.063376839685177e-05, "loss": 2.817, "step": 903000 }, { "epoch": 1.8913068962052448, "grad_norm": 12.332233428955078, "learning_rate": 4.062852414036148e-05, "loss": 2.8042, "step": 903500 }, { "epoch": 1.8923535519308705, "grad_norm": 12.411184310913086, "learning_rate": 4.0623279883871186e-05, "loss": 2.7928, "step": 904000 }, { "epoch": 1.893400207656496, "grad_norm": 14.123671531677246, "learning_rate": 4.06180356273809e-05, "loss": 2.8084, "step": 904500 }, { "epoch": 1.8944468633821214, "grad_norm": 15.588229179382324, "learning_rate": 4.06127913708906e-05, "loss": 2.8001, "step": 905000 }, { "epoch": 1.8954935191077469, "grad_norm": 13.51186752319336, "learning_rate": 4.060754711440031e-05, "loss": 2.8259, "step": 905500 }, { "epoch": 1.8965401748333726, "grad_norm": 13.274707794189453, "learning_rate": 4.060230285791002e-05, "loss": 2.8224, "step": 906000 }, { "epoch": 1.897586830558998, "grad_norm": 11.684950828552246, "learning_rate": 4.059705860141973e-05, "loss": 2.8233, "step": 906500 }, { "epoch": 1.8986334862846235, "grad_norm": 16.12947654724121, "learning_rate": 4.059181434492943e-05, "loss": 2.8138, "step": 907000 }, { "epoch": 1.899680142010249, "grad_norm": 35.25919723510742, "learning_rate": 4.0586570088439144e-05, "loss": 2.7824, "step": 907500 }, { "epoch": 1.9007267977358744, "grad_norm": 14.44081974029541, "learning_rate": 4.0581325831948855e-05, "loss": 2.8049, "step": 908000 }, { "epoch": 1.9017734534614998, "grad_norm": 13.153325080871582, "learning_rate": 4.057608157545856e-05, "loss": 2.8139, "step": 908500 }, { "epoch": 1.9028201091871253, "grad_norm": 14.818695068359375, "learning_rate": 4.057083731896827e-05, "loss": 2.7951, "step": 909000 }, { "epoch": 1.9038667649127508, "grad_norm": 13.608726501464844, "learning_rate": 4.0565593062477974e-05, "loss": 2.8149, "step": 909500 }, { "epoch": 1.9049134206383762, "grad_norm": 14.867524147033691, "learning_rate": 4.0560348805987685e-05, "loss": 2.7978, "step": 910000 }, { "epoch": 1.9059600763640017, "grad_norm": 12.654825210571289, "learning_rate": 4.055510454949739e-05, "loss": 2.7924, "step": 910500 }, { "epoch": 1.9070067320896271, "grad_norm": 21.14731216430664, "learning_rate": 4.05498602930071e-05, "loss": 2.8033, "step": 911000 }, { "epoch": 1.9080533878152526, "grad_norm": 12.69590950012207, "learning_rate": 4.0544616036516805e-05, "loss": 2.8182, "step": 911500 }, { "epoch": 1.909100043540878, "grad_norm": 11.944413185119629, "learning_rate": 4.0539371780026516e-05, "loss": 2.8211, "step": 912000 }, { "epoch": 1.9101466992665037, "grad_norm": 12.743996620178223, "learning_rate": 4.053412752353623e-05, "loss": 2.7934, "step": 912500 }, { "epoch": 1.9111933549921292, "grad_norm": 38.39024353027344, "learning_rate": 4.052888326704593e-05, "loss": 2.8141, "step": 913000 }, { "epoch": 1.9122400107177546, "grad_norm": 13.459626197814941, "learning_rate": 4.052363901055564e-05, "loss": 2.8013, "step": 913500 }, { "epoch": 1.91328666644338, "grad_norm": 13.598182678222656, "learning_rate": 4.051839475406535e-05, "loss": 2.8174, "step": 914000 }, { "epoch": 1.9143333221690058, "grad_norm": 12.525984764099121, "learning_rate": 4.051315049757506e-05, "loss": 2.7865, "step": 914500 }, { "epoch": 1.9153799778946312, "grad_norm": 12.69699764251709, "learning_rate": 4.050790624108476e-05, "loss": 2.8002, "step": 915000 }, { "epoch": 1.9164266336202567, "grad_norm": 14.339705467224121, "learning_rate": 4.0502661984594473e-05, "loss": 2.7941, "step": 915500 }, { "epoch": 1.9174732893458821, "grad_norm": 16.859460830688477, "learning_rate": 4.049741772810418e-05, "loss": 2.7952, "step": 916000 }, { "epoch": 1.9185199450715076, "grad_norm": 14.725451469421387, "learning_rate": 4.0492173471613896e-05, "loss": 2.8025, "step": 916500 }, { "epoch": 1.919566600797133, "grad_norm": 14.965503692626953, "learning_rate": 4.04869292151236e-05, "loss": 2.8196, "step": 917000 }, { "epoch": 1.9206132565227585, "grad_norm": 15.526006698608398, "learning_rate": 4.048168495863331e-05, "loss": 2.7925, "step": 917500 }, { "epoch": 1.921659912248384, "grad_norm": 15.660480499267578, "learning_rate": 4.0476440702143015e-05, "loss": 2.7709, "step": 918000 }, { "epoch": 1.9227065679740094, "grad_norm": 11.787469863891602, "learning_rate": 4.047119644565272e-05, "loss": 2.8118, "step": 918500 }, { "epoch": 1.9237532236996349, "grad_norm": 14.205131530761719, "learning_rate": 4.046595218916243e-05, "loss": 2.8111, "step": 919000 }, { "epoch": 1.9247998794252603, "grad_norm": 13.828117370605469, "learning_rate": 4.0460707932672135e-05, "loss": 2.8031, "step": 919500 }, { "epoch": 1.9258465351508858, "grad_norm": 34.88142776489258, "learning_rate": 4.0455463676181846e-05, "loss": 2.7913, "step": 920000 }, { "epoch": 1.9268931908765112, "grad_norm": 12.356874465942383, "learning_rate": 4.045021941969156e-05, "loss": 2.814, "step": 920500 }, { "epoch": 1.9279398466021367, "grad_norm": 12.09486198425293, "learning_rate": 4.044497516320127e-05, "loss": 2.7988, "step": 921000 }, { "epoch": 1.9289865023277624, "grad_norm": 11.829706192016602, "learning_rate": 4.043973090671097e-05, "loss": 2.8085, "step": 921500 }, { "epoch": 1.9300331580533878, "grad_norm": 11.537567138671875, "learning_rate": 4.0434486650220684e-05, "loss": 2.7741, "step": 922000 }, { "epoch": 1.9310798137790133, "grad_norm": 13.438529968261719, "learning_rate": 4.042924239373039e-05, "loss": 2.8104, "step": 922500 }, { "epoch": 1.9321264695046387, "grad_norm": 16.869220733642578, "learning_rate": 4.04239981372401e-05, "loss": 2.7992, "step": 923000 }, { "epoch": 1.9331731252302644, "grad_norm": 14.138172149658203, "learning_rate": 4.04187538807498e-05, "loss": 2.8055, "step": 923500 }, { "epoch": 1.9342197809558899, "grad_norm": 14.01526927947998, "learning_rate": 4.041350962425951e-05, "loss": 2.8026, "step": 924000 }, { "epoch": 1.9352664366815153, "grad_norm": 13.156599044799805, "learning_rate": 4.040826536776922e-05, "loss": 2.7841, "step": 924500 }, { "epoch": 1.9363130924071408, "grad_norm": 12.617264747619629, "learning_rate": 4.040302111127893e-05, "loss": 2.78, "step": 925000 }, { "epoch": 1.9373597481327662, "grad_norm": 11.448747634887695, "learning_rate": 4.039777685478864e-05, "loss": 2.8108, "step": 925500 }, { "epoch": 1.9384064038583917, "grad_norm": 15.711140632629395, "learning_rate": 4.0392532598298345e-05, "loss": 2.7702, "step": 926000 }, { "epoch": 1.9394530595840171, "grad_norm": 14.621039390563965, "learning_rate": 4.0387288341808056e-05, "loss": 2.7925, "step": 926500 }, { "epoch": 1.9404997153096426, "grad_norm": 15.497955322265625, "learning_rate": 4.038204408531776e-05, "loss": 2.7785, "step": 927000 }, { "epoch": 1.941546371035268, "grad_norm": 12.06850814819336, "learning_rate": 4.037679982882747e-05, "loss": 2.7966, "step": 927500 }, { "epoch": 1.9425930267608935, "grad_norm": 14.102771759033203, "learning_rate": 4.0371555572337176e-05, "loss": 2.7867, "step": 928000 }, { "epoch": 1.943639682486519, "grad_norm": 106.06127166748047, "learning_rate": 4.036631131584689e-05, "loss": 2.8103, "step": 928500 }, { "epoch": 1.9446863382121444, "grad_norm": 11.389826774597168, "learning_rate": 4.036106705935659e-05, "loss": 2.794, "step": 929000 }, { "epoch": 1.9457329939377699, "grad_norm": 14.599651336669922, "learning_rate": 4.03558228028663e-05, "loss": 2.7993, "step": 929500 }, { "epoch": 1.9467796496633956, "grad_norm": 18.320362091064453, "learning_rate": 4.0350578546376013e-05, "loss": 2.7883, "step": 930000 }, { "epoch": 1.947826305389021, "grad_norm": 12.398736000061035, "learning_rate": 4.034533428988572e-05, "loss": 2.7725, "step": 930500 }, { "epoch": 1.9488729611146465, "grad_norm": 25.370655059814453, "learning_rate": 4.034009003339543e-05, "loss": 2.7738, "step": 931000 }, { "epoch": 1.949919616840272, "grad_norm": 23.70241355895996, "learning_rate": 4.033484577690513e-05, "loss": 2.799, "step": 931500 }, { "epoch": 1.9509662725658976, "grad_norm": 12.43274974822998, "learning_rate": 4.0329601520414844e-05, "loss": 2.8058, "step": 932000 }, { "epoch": 1.952012928291523, "grad_norm": 16.869647979736328, "learning_rate": 4.032435726392455e-05, "loss": 2.7976, "step": 932500 }, { "epoch": 1.9530595840171485, "grad_norm": 18.193910598754883, "learning_rate": 4.031911300743426e-05, "loss": 2.7916, "step": 933000 }, { "epoch": 1.954106239742774, "grad_norm": 11.985641479492188, "learning_rate": 4.0313868750943964e-05, "loss": 2.807, "step": 933500 }, { "epoch": 1.9551528954683994, "grad_norm": 25.80848503112793, "learning_rate": 4.030862449445368e-05, "loss": 2.7873, "step": 934000 }, { "epoch": 1.9561995511940249, "grad_norm": 14.410115242004395, "learning_rate": 4.0303380237963386e-05, "loss": 2.7697, "step": 934500 }, { "epoch": 1.9572462069196503, "grad_norm": 12.826255798339844, "learning_rate": 4.02981359814731e-05, "loss": 2.8074, "step": 935000 }, { "epoch": 1.9582928626452758, "grad_norm": 15.460027694702148, "learning_rate": 4.02928917249828e-05, "loss": 2.7777, "step": 935500 }, { "epoch": 1.9593395183709013, "grad_norm": 15.37759017944336, "learning_rate": 4.0287647468492506e-05, "loss": 2.8101, "step": 936000 }, { "epoch": 1.9603861740965267, "grad_norm": 12.849650382995605, "learning_rate": 4.028240321200222e-05, "loss": 2.785, "step": 936500 }, { "epoch": 1.9614328298221522, "grad_norm": 12.814047813415527, "learning_rate": 4.027715895551192e-05, "loss": 2.7775, "step": 937000 }, { "epoch": 1.9624794855477776, "grad_norm": 17.629159927368164, "learning_rate": 4.027191469902163e-05, "loss": 2.8116, "step": 937500 }, { "epoch": 1.963526141273403, "grad_norm": 18.39391326904297, "learning_rate": 4.026667044253134e-05, "loss": 2.8059, "step": 938000 }, { "epoch": 1.9645727969990288, "grad_norm": 18.1369571685791, "learning_rate": 4.0261426186041054e-05, "loss": 2.7853, "step": 938500 }, { "epoch": 1.9656194527246542, "grad_norm": 12.657309532165527, "learning_rate": 4.025618192955076e-05, "loss": 2.772, "step": 939000 }, { "epoch": 1.9666661084502797, "grad_norm": 13.8775053024292, "learning_rate": 4.025093767306047e-05, "loss": 2.796, "step": 939500 }, { "epoch": 1.9677127641759051, "grad_norm": 13.934861183166504, "learning_rate": 4.0245693416570174e-05, "loss": 2.7959, "step": 940000 }, { "epoch": 1.9687594199015306, "grad_norm": 24.936124801635742, "learning_rate": 4.0240449160079885e-05, "loss": 2.7623, "step": 940500 }, { "epoch": 1.9698060756271563, "grad_norm": 15.196057319641113, "learning_rate": 4.023520490358959e-05, "loss": 2.7926, "step": 941000 }, { "epoch": 1.9708527313527817, "grad_norm": 18.060224533081055, "learning_rate": 4.0229960647099294e-05, "loss": 2.8048, "step": 941500 }, { "epoch": 1.9718993870784072, "grad_norm": 14.796953201293945, "learning_rate": 4.0224716390609005e-05, "loss": 2.7993, "step": 942000 }, { "epoch": 1.9729460428040326, "grad_norm": 22.65426254272461, "learning_rate": 4.0219472134118716e-05, "loss": 2.8004, "step": 942500 }, { "epoch": 1.973992698529658, "grad_norm": 71.09022521972656, "learning_rate": 4.021422787762843e-05, "loss": 2.817, "step": 943000 }, { "epoch": 1.9750393542552835, "grad_norm": 30.859981536865234, "learning_rate": 4.020898362113813e-05, "loss": 2.7816, "step": 943500 }, { "epoch": 1.976086009980909, "grad_norm": 16.114221572875977, "learning_rate": 4.020373936464784e-05, "loss": 2.8035, "step": 944000 }, { "epoch": 1.9771326657065345, "grad_norm": 13.964943885803223, "learning_rate": 4.019849510815755e-05, "loss": 2.7973, "step": 944500 }, { "epoch": 1.97817932143216, "grad_norm": 19.380346298217773, "learning_rate": 4.019325085166726e-05, "loss": 2.8027, "step": 945000 }, { "epoch": 1.9792259771577854, "grad_norm": 13.238539695739746, "learning_rate": 4.018800659517696e-05, "loss": 2.8131, "step": 945500 }, { "epoch": 1.9802726328834108, "grad_norm": 16.382652282714844, "learning_rate": 4.018276233868667e-05, "loss": 2.7921, "step": 946000 }, { "epoch": 1.9813192886090363, "grad_norm": 11.578733444213867, "learning_rate": 4.017751808219638e-05, "loss": 2.7913, "step": 946500 }, { "epoch": 1.9823659443346617, "grad_norm": 15.196420669555664, "learning_rate": 4.017227382570609e-05, "loss": 2.8094, "step": 947000 }, { "epoch": 1.9834126000602874, "grad_norm": 18.739805221557617, "learning_rate": 4.01670295692158e-05, "loss": 2.7888, "step": 947500 }, { "epoch": 1.9844592557859129, "grad_norm": 24.495634078979492, "learning_rate": 4.0161785312725504e-05, "loss": 2.7865, "step": 948000 }, { "epoch": 1.9855059115115383, "grad_norm": 12.805275917053223, "learning_rate": 4.0156541056235215e-05, "loss": 2.8093, "step": 948500 }, { "epoch": 1.9865525672371638, "grad_norm": 13.688287734985352, "learning_rate": 4.015129679974492e-05, "loss": 2.7901, "step": 949000 }, { "epoch": 1.9875992229627895, "grad_norm": 13.000855445861816, "learning_rate": 4.014605254325463e-05, "loss": 2.79, "step": 949500 }, { "epoch": 1.988645878688415, "grad_norm": 12.559382438659668, "learning_rate": 4.0140808286764335e-05, "loss": 2.7936, "step": 950000 }, { "epoch": 1.9896925344140404, "grad_norm": 12.655092239379883, "learning_rate": 4.0135564030274046e-05, "loss": 2.8102, "step": 950500 }, { "epoch": 1.9907391901396658, "grad_norm": 13.971698760986328, "learning_rate": 4.013031977378375e-05, "loss": 2.8013, "step": 951000 }, { "epoch": 1.9917858458652913, "grad_norm": 14.745405197143555, "learning_rate": 4.012507551729346e-05, "loss": 2.7992, "step": 951500 }, { "epoch": 1.9928325015909167, "grad_norm": 14.377123832702637, "learning_rate": 4.011983126080317e-05, "loss": 2.7867, "step": 952000 }, { "epoch": 1.9938791573165422, "grad_norm": 13.77749252319336, "learning_rate": 4.0114587004312876e-05, "loss": 2.7973, "step": 952500 }, { "epoch": 1.9949258130421677, "grad_norm": 15.24754524230957, "learning_rate": 4.010934274782259e-05, "loss": 2.7937, "step": 953000 }, { "epoch": 1.995972468767793, "grad_norm": 14.357891082763672, "learning_rate": 4.010409849133229e-05, "loss": 2.8112, "step": 953500 }, { "epoch": 1.9970191244934186, "grad_norm": 12.995698928833008, "learning_rate": 4.0098854234842e-05, "loss": 2.7941, "step": 954000 }, { "epoch": 1.998065780219044, "grad_norm": 38.09659957885742, "learning_rate": 4.009360997835171e-05, "loss": 2.789, "step": 954500 }, { "epoch": 1.9991124359446695, "grad_norm": 13.227173805236816, "learning_rate": 4.008836572186142e-05, "loss": 2.789, "step": 955000 }, { "epoch": 2.000159091670295, "grad_norm": 13.349320411682129, "learning_rate": 4.008312146537113e-05, "loss": 2.7759, "step": 955500 }, { "epoch": 2.0012057473959204, "grad_norm": 17.14556312561035, "learning_rate": 4.007787720888084e-05, "loss": 2.7851, "step": 956000 }, { "epoch": 2.002252403121546, "grad_norm": 14.20517349243164, "learning_rate": 4.0072632952390545e-05, "loss": 2.7699, "step": 956500 }, { "epoch": 2.0032990588471713, "grad_norm": 13.569400787353516, "learning_rate": 4.0067388695900256e-05, "loss": 2.7829, "step": 957000 }, { "epoch": 2.004345714572797, "grad_norm": 13.011297225952148, "learning_rate": 4.006214443940996e-05, "loss": 2.8039, "step": 957500 }, { "epoch": 2.0053923702984227, "grad_norm": 12.285758972167969, "learning_rate": 4.005690018291967e-05, "loss": 2.7945, "step": 958000 }, { "epoch": 2.006439026024048, "grad_norm": 12.09470272064209, "learning_rate": 4.0051655926429376e-05, "loss": 2.7823, "step": 958500 }, { "epoch": 2.0074856817496736, "grad_norm": 17.365373611450195, "learning_rate": 4.004641166993908e-05, "loss": 2.7996, "step": 959000 }, { "epoch": 2.008532337475299, "grad_norm": 14.123476028442383, "learning_rate": 4.004116741344879e-05, "loss": 2.7884, "step": 959500 }, { "epoch": 2.0095789932009245, "grad_norm": 16.086605072021484, "learning_rate": 4.00359231569585e-05, "loss": 2.7802, "step": 960000 }, { "epoch": 2.01062564892655, "grad_norm": 15.181049346923828, "learning_rate": 4.003067890046821e-05, "loss": 2.7872, "step": 960500 }, { "epoch": 2.0116723046521754, "grad_norm": 12.625993728637695, "learning_rate": 4.002543464397792e-05, "loss": 2.7778, "step": 961000 }, { "epoch": 2.012718960377801, "grad_norm": 14.951041221618652, "learning_rate": 4.002019038748763e-05, "loss": 2.7726, "step": 961500 }, { "epoch": 2.0137656161034263, "grad_norm": 13.743858337402344, "learning_rate": 4.001494613099733e-05, "loss": 2.7791, "step": 962000 }, { "epoch": 2.0148122718290518, "grad_norm": 12.758158683776855, "learning_rate": 4.0009701874507044e-05, "loss": 2.7855, "step": 962500 }, { "epoch": 2.015858927554677, "grad_norm": 13.236380577087402, "learning_rate": 4.000445761801675e-05, "loss": 2.7881, "step": 963000 }, { "epoch": 2.0169055832803027, "grad_norm": 14.955841064453125, "learning_rate": 3.999921336152646e-05, "loss": 2.7821, "step": 963500 }, { "epoch": 2.017952239005928, "grad_norm": 13.335437774658203, "learning_rate": 3.9993969105036164e-05, "loss": 2.7698, "step": 964000 }, { "epoch": 2.0189988947315536, "grad_norm": 13.164236068725586, "learning_rate": 3.9988724848545875e-05, "loss": 2.7598, "step": 964500 }, { "epoch": 2.020045550457179, "grad_norm": 12.902059555053711, "learning_rate": 3.9983480592055586e-05, "loss": 2.7851, "step": 965000 }, { "epoch": 2.0210922061828045, "grad_norm": 13.878458023071289, "learning_rate": 3.997823633556529e-05, "loss": 2.7605, "step": 965500 }, { "epoch": 2.0221388619084304, "grad_norm": 11.858596801757812, "learning_rate": 3.9972992079075e-05, "loss": 2.7702, "step": 966000 }, { "epoch": 2.023185517634056, "grad_norm": 14.871109008789062, "learning_rate": 3.9967747822584705e-05, "loss": 2.7811, "step": 966500 }, { "epoch": 2.0242321733596813, "grad_norm": 16.11600685119629, "learning_rate": 3.9962503566094416e-05, "loss": 2.777, "step": 967000 }, { "epoch": 2.0252788290853068, "grad_norm": 14.908331871032715, "learning_rate": 3.995725930960412e-05, "loss": 2.7803, "step": 967500 }, { "epoch": 2.026325484810932, "grad_norm": 14.113607406616211, "learning_rate": 3.995201505311383e-05, "loss": 2.7804, "step": 968000 }, { "epoch": 2.0273721405365577, "grad_norm": 24.095142364501953, "learning_rate": 3.9946770796623536e-05, "loss": 2.7825, "step": 968500 }, { "epoch": 2.028418796262183, "grad_norm": 23.331741333007812, "learning_rate": 3.994152654013325e-05, "loss": 2.768, "step": 969000 }, { "epoch": 2.0294654519878086, "grad_norm": 12.044859886169434, "learning_rate": 3.993628228364296e-05, "loss": 2.7847, "step": 969500 }, { "epoch": 2.030512107713434, "grad_norm": 16.766557693481445, "learning_rate": 3.993103802715266e-05, "loss": 2.801, "step": 970000 }, { "epoch": 2.0315587634390595, "grad_norm": 12.889524459838867, "learning_rate": 3.9925793770662374e-05, "loss": 2.7682, "step": 970500 }, { "epoch": 2.032605419164685, "grad_norm": 11.674165725708008, "learning_rate": 3.992054951417208e-05, "loss": 2.7787, "step": 971000 }, { "epoch": 2.0336520748903104, "grad_norm": 16.213899612426758, "learning_rate": 3.991530525768179e-05, "loss": 2.7799, "step": 971500 }, { "epoch": 2.034698730615936, "grad_norm": 16.2750186920166, "learning_rate": 3.9910061001191493e-05, "loss": 2.7825, "step": 972000 }, { "epoch": 2.0357453863415613, "grad_norm": 12.405789375305176, "learning_rate": 3.9904816744701204e-05, "loss": 2.7841, "step": 972500 }, { "epoch": 2.036792042067187, "grad_norm": 14.215045928955078, "learning_rate": 3.9899572488210916e-05, "loss": 2.7759, "step": 973000 }, { "epoch": 2.0378386977928122, "grad_norm": 18.397592544555664, "learning_rate": 3.989432823172063e-05, "loss": 2.768, "step": 973500 }, { "epoch": 2.0388853535184377, "grad_norm": 13.590423583984375, "learning_rate": 3.988908397523033e-05, "loss": 2.7694, "step": 974000 }, { "epoch": 2.0399320092440636, "grad_norm": 14.872385025024414, "learning_rate": 3.988383971874004e-05, "loss": 2.7841, "step": 974500 }, { "epoch": 2.040978664969689, "grad_norm": 16.80933380126953, "learning_rate": 3.9878595462249746e-05, "loss": 2.8027, "step": 975000 }, { "epoch": 2.0420253206953145, "grad_norm": 13.350442886352539, "learning_rate": 3.987335120575945e-05, "loss": 2.784, "step": 975500 }, { "epoch": 2.04307197642094, "grad_norm": 12.390050888061523, "learning_rate": 3.986810694926916e-05, "loss": 2.784, "step": 976000 }, { "epoch": 2.0441186321465654, "grad_norm": 11.606134414672852, "learning_rate": 3.9862862692778866e-05, "loss": 2.7685, "step": 976500 }, { "epoch": 2.045165287872191, "grad_norm": 13.878583908081055, "learning_rate": 3.985761843628858e-05, "loss": 2.7712, "step": 977000 }, { "epoch": 2.0462119435978163, "grad_norm": 14.922723770141602, "learning_rate": 3.985237417979829e-05, "loss": 2.7891, "step": 977500 }, { "epoch": 2.047258599323442, "grad_norm": 13.3042631149292, "learning_rate": 3.9847129923308e-05, "loss": 2.7812, "step": 978000 }, { "epoch": 2.0483052550490672, "grad_norm": 13.511171340942383, "learning_rate": 3.9841885666817704e-05, "loss": 2.7811, "step": 978500 }, { "epoch": 2.0493519107746927, "grad_norm": 14.065692901611328, "learning_rate": 3.9836641410327415e-05, "loss": 2.7751, "step": 979000 }, { "epoch": 2.050398566500318, "grad_norm": 12.604851722717285, "learning_rate": 3.983139715383712e-05, "loss": 2.7685, "step": 979500 }, { "epoch": 2.0514452222259436, "grad_norm": 17.3243465423584, "learning_rate": 3.982615289734683e-05, "loss": 2.7883, "step": 980000 }, { "epoch": 2.052491877951569, "grad_norm": 13.292418479919434, "learning_rate": 3.9820908640856534e-05, "loss": 2.765, "step": 980500 }, { "epoch": 2.0535385336771945, "grad_norm": 13.020552635192871, "learning_rate": 3.981566438436624e-05, "loss": 2.7707, "step": 981000 }, { "epoch": 2.05458518940282, "grad_norm": 13.216552734375, "learning_rate": 3.981042012787595e-05, "loss": 2.7904, "step": 981500 }, { "epoch": 2.0556318451284454, "grad_norm": 16.05988121032715, "learning_rate": 3.980517587138566e-05, "loss": 2.7872, "step": 982000 }, { "epoch": 2.056678500854071, "grad_norm": 16.655826568603516, "learning_rate": 3.979993161489537e-05, "loss": 2.7761, "step": 982500 }, { "epoch": 2.0577251565796963, "grad_norm": 23.78839111328125, "learning_rate": 3.9794687358405076e-05, "loss": 2.809, "step": 983000 }, { "epoch": 2.0587718123053222, "grad_norm": 13.03066349029541, "learning_rate": 3.978944310191479e-05, "loss": 2.7807, "step": 983500 }, { "epoch": 2.0598184680309477, "grad_norm": 15.365605354309082, "learning_rate": 3.978419884542449e-05, "loss": 2.7887, "step": 984000 }, { "epoch": 2.060865123756573, "grad_norm": 17.080923080444336, "learning_rate": 3.97789545889342e-05, "loss": 2.7792, "step": 984500 }, { "epoch": 2.0619117794821986, "grad_norm": 22.433088302612305, "learning_rate": 3.977371033244391e-05, "loss": 2.7934, "step": 985000 }, { "epoch": 2.062958435207824, "grad_norm": 13.880253791809082, "learning_rate": 3.976846607595362e-05, "loss": 2.7749, "step": 985500 }, { "epoch": 2.0640050909334495, "grad_norm": 15.423989295959473, "learning_rate": 3.976322181946332e-05, "loss": 2.8109, "step": 986000 }, { "epoch": 2.065051746659075, "grad_norm": 17.903024673461914, "learning_rate": 3.975797756297303e-05, "loss": 2.7798, "step": 986500 }, { "epoch": 2.0660984023847004, "grad_norm": 12.643593788146973, "learning_rate": 3.9752733306482744e-05, "loss": 2.7832, "step": 987000 }, { "epoch": 2.067145058110326, "grad_norm": 15.828659057617188, "learning_rate": 3.974748904999245e-05, "loss": 2.8122, "step": 987500 }, { "epoch": 2.0681917138359514, "grad_norm": 14.299464225769043, "learning_rate": 3.974224479350216e-05, "loss": 2.7691, "step": 988000 }, { "epoch": 2.069238369561577, "grad_norm": 12.995485305786133, "learning_rate": 3.9737000537011864e-05, "loss": 2.7756, "step": 988500 }, { "epoch": 2.0702850252872023, "grad_norm": 46.95344161987305, "learning_rate": 3.9731756280521575e-05, "loss": 2.789, "step": 989000 }, { "epoch": 2.0713316810128277, "grad_norm": 14.338959693908691, "learning_rate": 3.972651202403128e-05, "loss": 2.7798, "step": 989500 }, { "epoch": 2.072378336738453, "grad_norm": 14.55030632019043, "learning_rate": 3.972126776754099e-05, "loss": 2.7608, "step": 990000 }, { "epoch": 2.0734249924640786, "grad_norm": 13.54639720916748, "learning_rate": 3.97160235110507e-05, "loss": 2.7655, "step": 990500 }, { "epoch": 2.074471648189704, "grad_norm": 12.245502471923828, "learning_rate": 3.971077925456041e-05, "loss": 2.7797, "step": 991000 }, { "epoch": 2.0755183039153295, "grad_norm": 13.260289192199707, "learning_rate": 3.970553499807012e-05, "loss": 2.7795, "step": 991500 }, { "epoch": 2.0765649596409554, "grad_norm": 12.343256950378418, "learning_rate": 3.970029074157983e-05, "loss": 2.7567, "step": 992000 }, { "epoch": 2.077611615366581, "grad_norm": 12.577105522155762, "learning_rate": 3.969504648508953e-05, "loss": 2.7726, "step": 992500 }, { "epoch": 2.0786582710922064, "grad_norm": 13.586813926696777, "learning_rate": 3.968980222859924e-05, "loss": 2.7968, "step": 993000 }, { "epoch": 2.079704926817832, "grad_norm": 13.10605239868164, "learning_rate": 3.968455797210895e-05, "loss": 2.7775, "step": 993500 }, { "epoch": 2.0807515825434573, "grad_norm": 11.964317321777344, "learning_rate": 3.967931371561865e-05, "loss": 2.772, "step": 994000 }, { "epoch": 2.0817982382690827, "grad_norm": 12.820301055908203, "learning_rate": 3.967406945912836e-05, "loss": 2.7767, "step": 994500 }, { "epoch": 2.082844893994708, "grad_norm": 15.513683319091797, "learning_rate": 3.9668825202638074e-05, "loss": 2.7832, "step": 995000 }, { "epoch": 2.0838915497203336, "grad_norm": 35.288818359375, "learning_rate": 3.9663580946147785e-05, "loss": 2.7711, "step": 995500 }, { "epoch": 2.084938205445959, "grad_norm": 11.763365745544434, "learning_rate": 3.965833668965749e-05, "loss": 2.7841, "step": 996000 }, { "epoch": 2.0859848611715845, "grad_norm": 14.163705825805664, "learning_rate": 3.96530924331672e-05, "loss": 2.7902, "step": 996500 }, { "epoch": 2.08703151689721, "grad_norm": 15.498322486877441, "learning_rate": 3.9647848176676905e-05, "loss": 2.7638, "step": 997000 }, { "epoch": 2.0880781726228355, "grad_norm": 11.77759838104248, "learning_rate": 3.9642603920186616e-05, "loss": 2.769, "step": 997500 }, { "epoch": 2.089124828348461, "grad_norm": 11.633956909179688, "learning_rate": 3.963735966369632e-05, "loss": 2.7806, "step": 998000 }, { "epoch": 2.0901714840740864, "grad_norm": 13.34634017944336, "learning_rate": 3.9632115407206025e-05, "loss": 2.762, "step": 998500 }, { "epoch": 2.091218139799712, "grad_norm": 25.806093215942383, "learning_rate": 3.9626871150715736e-05, "loss": 2.7876, "step": 999000 }, { "epoch": 2.0922647955253373, "grad_norm": 13.141804695129395, "learning_rate": 3.962162689422545e-05, "loss": 2.7759, "step": 999500 }, { "epoch": 2.0933114512509627, "grad_norm": 15.774669647216797, "learning_rate": 3.961638263773516e-05, "loss": 2.8036, "step": 1000000 }, { "epoch": 2.094358106976588, "grad_norm": 12.371857643127441, "learning_rate": 3.961113838124486e-05, "loss": 2.7725, "step": 1000500 }, { "epoch": 2.095404762702214, "grad_norm": 16.05453109741211, "learning_rate": 3.960589412475457e-05, "loss": 2.7731, "step": 1001000 }, { "epoch": 2.0964514184278396, "grad_norm": 25.279430389404297, "learning_rate": 3.960064986826428e-05, "loss": 2.7694, "step": 1001500 }, { "epoch": 2.097498074153465, "grad_norm": 12.539361953735352, "learning_rate": 3.959540561177399e-05, "loss": 2.7803, "step": 1002000 }, { "epoch": 2.0985447298790905, "grad_norm": 14.719816207885742, "learning_rate": 3.959016135528369e-05, "loss": 2.7705, "step": 1002500 }, { "epoch": 2.099591385604716, "grad_norm": 66.86105346679688, "learning_rate": 3.9584917098793404e-05, "loss": 2.7793, "step": 1003000 }, { "epoch": 2.1006380413303414, "grad_norm": 11.268272399902344, "learning_rate": 3.957967284230311e-05, "loss": 2.773, "step": 1003500 }, { "epoch": 2.101684697055967, "grad_norm": 13.657160758972168, "learning_rate": 3.957442858581282e-05, "loss": 2.7849, "step": 1004000 }, { "epoch": 2.1027313527815923, "grad_norm": 13.481696128845215, "learning_rate": 3.956918432932253e-05, "loss": 2.7914, "step": 1004500 }, { "epoch": 2.1037780085072177, "grad_norm": 13.79433822631836, "learning_rate": 3.9563940072832235e-05, "loss": 2.7593, "step": 1005000 }, { "epoch": 2.104824664232843, "grad_norm": 15.37031078338623, "learning_rate": 3.9558695816341946e-05, "loss": 2.7673, "step": 1005500 }, { "epoch": 2.1058713199584687, "grad_norm": 16.721546173095703, "learning_rate": 3.955345155985165e-05, "loss": 2.7591, "step": 1006000 }, { "epoch": 2.106917975684094, "grad_norm": 15.090466499328613, "learning_rate": 3.954820730336136e-05, "loss": 2.7602, "step": 1006500 }, { "epoch": 2.1079646314097196, "grad_norm": 10.94198989868164, "learning_rate": 3.9542963046871066e-05, "loss": 2.7716, "step": 1007000 }, { "epoch": 2.109011287135345, "grad_norm": 16.527729034423828, "learning_rate": 3.953771879038078e-05, "loss": 2.7532, "step": 1007500 }, { "epoch": 2.1100579428609705, "grad_norm": 12.024646759033203, "learning_rate": 3.953247453389049e-05, "loss": 2.7827, "step": 1008000 }, { "epoch": 2.111104598586596, "grad_norm": 14.727720260620117, "learning_rate": 3.95272302774002e-05, "loss": 2.7625, "step": 1008500 }, { "epoch": 2.1121512543122214, "grad_norm": 17.092119216918945, "learning_rate": 3.95219860209099e-05, "loss": 2.7629, "step": 1009000 }, { "epoch": 2.1131979100378473, "grad_norm": 14.936176300048828, "learning_rate": 3.951674176441961e-05, "loss": 2.7792, "step": 1009500 }, { "epoch": 2.1142445657634727, "grad_norm": 19.679704666137695, "learning_rate": 3.951149750792932e-05, "loss": 2.8003, "step": 1010000 }, { "epoch": 2.115291221489098, "grad_norm": 13.503580093383789, "learning_rate": 3.950625325143902e-05, "loss": 2.7992, "step": 1010500 }, { "epoch": 2.1163378772147237, "grad_norm": 13.536111831665039, "learning_rate": 3.9501008994948734e-05, "loss": 2.7847, "step": 1011000 }, { "epoch": 2.117384532940349, "grad_norm": 13.749455451965332, "learning_rate": 3.949576473845844e-05, "loss": 2.7617, "step": 1011500 }, { "epoch": 2.1184311886659746, "grad_norm": 13.142685890197754, "learning_rate": 3.949052048196815e-05, "loss": 2.7518, "step": 1012000 }, { "epoch": 2.1194778443916, "grad_norm": 12.107606887817383, "learning_rate": 3.948527622547786e-05, "loss": 2.7761, "step": 1012500 }, { "epoch": 2.1205245001172255, "grad_norm": 11.727561950683594, "learning_rate": 3.948003196898757e-05, "loss": 2.7809, "step": 1013000 }, { "epoch": 2.121571155842851, "grad_norm": 11.718313217163086, "learning_rate": 3.9474787712497276e-05, "loss": 2.7793, "step": 1013500 }, { "epoch": 2.1226178115684764, "grad_norm": 14.149621963500977, "learning_rate": 3.946954345600699e-05, "loss": 2.7931, "step": 1014000 }, { "epoch": 2.123664467294102, "grad_norm": 15.547350883483887, "learning_rate": 3.946429919951669e-05, "loss": 2.7787, "step": 1014500 }, { "epoch": 2.1247111230197273, "grad_norm": 12.325373649597168, "learning_rate": 3.9459054943026395e-05, "loss": 2.7718, "step": 1015000 }, { "epoch": 2.1257577787453528, "grad_norm": 14.729299545288086, "learning_rate": 3.9453810686536107e-05, "loss": 2.7661, "step": 1015500 }, { "epoch": 2.1268044344709782, "grad_norm": 14.352572441101074, "learning_rate": 3.944856643004581e-05, "loss": 2.768, "step": 1016000 }, { "epoch": 2.1278510901966037, "grad_norm": 14.023016929626465, "learning_rate": 3.944332217355552e-05, "loss": 2.769, "step": 1016500 }, { "epoch": 2.128897745922229, "grad_norm": 14.406769752502441, "learning_rate": 3.943807791706523e-05, "loss": 2.7433, "step": 1017000 }, { "epoch": 2.1299444016478546, "grad_norm": 13.825115203857422, "learning_rate": 3.9432833660574944e-05, "loss": 2.7636, "step": 1017500 }, { "epoch": 2.13099105737348, "grad_norm": 14.939013481140137, "learning_rate": 3.942758940408465e-05, "loss": 2.7721, "step": 1018000 }, { "epoch": 2.1320377130991055, "grad_norm": 12.283696174621582, "learning_rate": 3.942234514759436e-05, "loss": 2.7781, "step": 1018500 }, { "epoch": 2.1330843688247314, "grad_norm": 14.37260627746582, "learning_rate": 3.9417100891104064e-05, "loss": 2.749, "step": 1019000 }, { "epoch": 2.134131024550357, "grad_norm": 12.341817855834961, "learning_rate": 3.9411856634613775e-05, "loss": 2.7668, "step": 1019500 }, { "epoch": 2.1351776802759823, "grad_norm": 14.737007141113281, "learning_rate": 3.940661237812348e-05, "loss": 2.7391, "step": 1020000 }, { "epoch": 2.1362243360016078, "grad_norm": 16.45636749267578, "learning_rate": 3.9401368121633183e-05, "loss": 2.7586, "step": 1020500 }, { "epoch": 2.1372709917272332, "grad_norm": 40.82760238647461, "learning_rate": 3.9396123865142895e-05, "loss": 2.7644, "step": 1021000 }, { "epoch": 2.1383176474528587, "grad_norm": 15.691339492797852, "learning_rate": 3.9390879608652606e-05, "loss": 2.7588, "step": 1021500 }, { "epoch": 2.139364303178484, "grad_norm": 13.09339714050293, "learning_rate": 3.938563535216232e-05, "loss": 2.7839, "step": 1022000 }, { "epoch": 2.1404109589041096, "grad_norm": 15.30900764465332, "learning_rate": 3.938039109567202e-05, "loss": 2.7575, "step": 1022500 }, { "epoch": 2.141457614629735, "grad_norm": 18.692102432250977, "learning_rate": 3.937514683918173e-05, "loss": 2.7489, "step": 1023000 }, { "epoch": 2.1425042703553605, "grad_norm": 89.4323501586914, "learning_rate": 3.9369902582691436e-05, "loss": 2.7719, "step": 1023500 }, { "epoch": 2.143550926080986, "grad_norm": 13.620752334594727, "learning_rate": 3.936465832620115e-05, "loss": 2.754, "step": 1024000 }, { "epoch": 2.1445975818066114, "grad_norm": 15.194357872009277, "learning_rate": 3.935941406971085e-05, "loss": 2.748, "step": 1024500 }, { "epoch": 2.145644237532237, "grad_norm": 13.201043128967285, "learning_rate": 3.935416981322056e-05, "loss": 2.7825, "step": 1025000 }, { "epoch": 2.1466908932578623, "grad_norm": 11.920708656311035, "learning_rate": 3.9348925556730274e-05, "loss": 2.7639, "step": 1025500 }, { "epoch": 2.147737548983488, "grad_norm": 14.623852729797363, "learning_rate": 3.9343681300239985e-05, "loss": 2.7824, "step": 1026000 }, { "epoch": 2.1487842047091132, "grad_norm": 21.44986915588379, "learning_rate": 3.933843704374969e-05, "loss": 2.7722, "step": 1026500 }, { "epoch": 2.149830860434739, "grad_norm": 49.4228401184082, "learning_rate": 3.9333192787259394e-05, "loss": 2.7773, "step": 1027000 }, { "epoch": 2.1508775161603646, "grad_norm": 15.415539741516113, "learning_rate": 3.9327948530769105e-05, "loss": 2.7566, "step": 1027500 }, { "epoch": 2.15192417188599, "grad_norm": 12.265122413635254, "learning_rate": 3.932270427427881e-05, "loss": 2.7818, "step": 1028000 }, { "epoch": 2.1529708276116155, "grad_norm": 11.6383638381958, "learning_rate": 3.931746001778852e-05, "loss": 2.7554, "step": 1028500 }, { "epoch": 2.154017483337241, "grad_norm": 14.248554229736328, "learning_rate": 3.9312215761298224e-05, "loss": 2.7538, "step": 1029000 }, { "epoch": 2.1550641390628664, "grad_norm": 13.85700511932373, "learning_rate": 3.9306971504807935e-05, "loss": 2.7706, "step": 1029500 }, { "epoch": 2.156110794788492, "grad_norm": 19.83302116394043, "learning_rate": 3.9301727248317647e-05, "loss": 2.7441, "step": 1030000 }, { "epoch": 2.1571574505141173, "grad_norm": 14.587701797485352, "learning_rate": 3.929648299182736e-05, "loss": 2.7613, "step": 1030500 }, { "epoch": 2.158204106239743, "grad_norm": 17.43794059753418, "learning_rate": 3.929123873533706e-05, "loss": 2.7818, "step": 1031000 }, { "epoch": 2.1592507619653682, "grad_norm": 12.122142791748047, "learning_rate": 3.928599447884677e-05, "loss": 2.7615, "step": 1031500 }, { "epoch": 2.1602974176909937, "grad_norm": 13.598921775817871, "learning_rate": 3.928075022235648e-05, "loss": 2.7561, "step": 1032000 }, { "epoch": 2.161344073416619, "grad_norm": 12.576292037963867, "learning_rate": 3.927550596586618e-05, "loss": 2.7573, "step": 1032500 }, { "epoch": 2.1623907291422446, "grad_norm": 15.995280265808105, "learning_rate": 3.927026170937589e-05, "loss": 2.7623, "step": 1033000 }, { "epoch": 2.16343738486787, "grad_norm": 17.217166900634766, "learning_rate": 3.92650174528856e-05, "loss": 2.7793, "step": 1033500 }, { "epoch": 2.1644840405934955, "grad_norm": 16.539335250854492, "learning_rate": 3.925977319639531e-05, "loss": 2.783, "step": 1034000 }, { "epoch": 2.165530696319121, "grad_norm": 19.74042510986328, "learning_rate": 3.925452893990502e-05, "loss": 2.752, "step": 1034500 }, { "epoch": 2.1665773520447464, "grad_norm": 17.728479385375977, "learning_rate": 3.924928468341473e-05, "loss": 2.7451, "step": 1035000 }, { "epoch": 2.167624007770372, "grad_norm": 72.37450408935547, "learning_rate": 3.9244040426924435e-05, "loss": 2.7746, "step": 1035500 }, { "epoch": 2.1686706634959974, "grad_norm": 12.831624984741211, "learning_rate": 3.9238796170434146e-05, "loss": 2.774, "step": 1036000 }, { "epoch": 2.1697173192216233, "grad_norm": 41.28123474121094, "learning_rate": 3.923355191394385e-05, "loss": 2.7502, "step": 1036500 }, { "epoch": 2.1707639749472487, "grad_norm": 19.11534881591797, "learning_rate": 3.922830765745356e-05, "loss": 2.7621, "step": 1037000 }, { "epoch": 2.171810630672874, "grad_norm": 13.684256553649902, "learning_rate": 3.9223063400963265e-05, "loss": 2.7602, "step": 1037500 }, { "epoch": 2.1728572863984996, "grad_norm": 12.285216331481934, "learning_rate": 3.921781914447297e-05, "loss": 2.7631, "step": 1038000 }, { "epoch": 2.173903942124125, "grad_norm": 12.935944557189941, "learning_rate": 3.921257488798268e-05, "loss": 2.7721, "step": 1038500 }, { "epoch": 2.1749505978497505, "grad_norm": 16.02663803100586, "learning_rate": 3.920733063149239e-05, "loss": 2.7727, "step": 1039000 }, { "epoch": 2.175997253575376, "grad_norm": 15.656719207763672, "learning_rate": 3.92020863750021e-05, "loss": 2.761, "step": 1039500 }, { "epoch": 2.1770439093010014, "grad_norm": 13.260160446166992, "learning_rate": 3.919684211851181e-05, "loss": 2.7663, "step": 1040000 }, { "epoch": 2.178090565026627, "grad_norm": Infinity, "learning_rate": 3.919159786202152e-05, "loss": 2.7595, "step": 1040500 }, { "epoch": 2.1791372207522524, "grad_norm": 10.168133735656738, "learning_rate": 3.918635360553122e-05, "loss": 2.7635, "step": 1041000 }, { "epoch": 2.180183876477878, "grad_norm": 14.210487365722656, "learning_rate": 3.9181109349040934e-05, "loss": 2.7528, "step": 1041500 }, { "epoch": 2.1812305322035033, "grad_norm": 13.887633323669434, "learning_rate": 3.917586509255064e-05, "loss": 2.7586, "step": 1042000 }, { "epoch": 2.1822771879291287, "grad_norm": 14.587437629699707, "learning_rate": 3.917062083606035e-05, "loss": 2.7506, "step": 1042500 }, { "epoch": 2.183323843654754, "grad_norm": 12.414336204528809, "learning_rate": 3.916537657957005e-05, "loss": 2.7741, "step": 1043000 }, { "epoch": 2.1843704993803796, "grad_norm": 15.170914649963379, "learning_rate": 3.9160132323079764e-05, "loss": 2.7425, "step": 1043500 }, { "epoch": 2.185417155106005, "grad_norm": 13.728641510009766, "learning_rate": 3.9154888066589475e-05, "loss": 2.7459, "step": 1044000 }, { "epoch": 2.186463810831631, "grad_norm": 13.968433380126953, "learning_rate": 3.914964381009918e-05, "loss": 2.7537, "step": 1044500 }, { "epoch": 2.1875104665572564, "grad_norm": 15.812704086303711, "learning_rate": 3.914439955360889e-05, "loss": 2.7684, "step": 1045000 }, { "epoch": 2.188557122282882, "grad_norm": 15.20522689819336, "learning_rate": 3.9139155297118595e-05, "loss": 2.7498, "step": 1045500 }, { "epoch": 2.1896037780085074, "grad_norm": 11.657735824584961, "learning_rate": 3.9133911040628306e-05, "loss": 2.7641, "step": 1046000 }, { "epoch": 2.190650433734133, "grad_norm": 14.412019729614258, "learning_rate": 3.912866678413801e-05, "loss": 2.763, "step": 1046500 }, { "epoch": 2.1916970894597583, "grad_norm": 14.68665599822998, "learning_rate": 3.912342252764772e-05, "loss": 2.7521, "step": 1047000 }, { "epoch": 2.1927437451853837, "grad_norm": 12.340994834899902, "learning_rate": 3.911817827115743e-05, "loss": 2.7268, "step": 1047500 }, { "epoch": 2.193790400911009, "grad_norm": 13.393275260925293, "learning_rate": 3.9112934014667144e-05, "loss": 2.7737, "step": 1048000 }, { "epoch": 2.1948370566366346, "grad_norm": 13.270233154296875, "learning_rate": 3.910768975817685e-05, "loss": 2.7628, "step": 1048500 }, { "epoch": 2.19588371236226, "grad_norm": 12.267745018005371, "learning_rate": 3.910244550168655e-05, "loss": 2.7513, "step": 1049000 }, { "epoch": 2.1969303680878856, "grad_norm": 12.45742416381836, "learning_rate": 3.9097201245196263e-05, "loss": 2.7544, "step": 1049500 }, { "epoch": 2.197977023813511, "grad_norm": 16.110301971435547, "learning_rate": 3.909195698870597e-05, "loss": 2.7537, "step": 1050000 }, { "epoch": 2.1990236795391365, "grad_norm": 15.964211463928223, "learning_rate": 3.908671273221568e-05, "loss": 2.7495, "step": 1050500 }, { "epoch": 2.200070335264762, "grad_norm": 13.448522567749023, "learning_rate": 3.908146847572538e-05, "loss": 2.7556, "step": 1051000 }, { "epoch": 2.2011169909903874, "grad_norm": 11.685571670532227, "learning_rate": 3.9076224219235094e-05, "loss": 2.7528, "step": 1051500 }, { "epoch": 2.202163646716013, "grad_norm": 12.173174858093262, "learning_rate": 3.9070979962744805e-05, "loss": 2.755, "step": 1052000 }, { "epoch": 2.2032103024416383, "grad_norm": 19.276424407958984, "learning_rate": 3.9065735706254516e-05, "loss": 2.7445, "step": 1052500 }, { "epoch": 2.2042569581672637, "grad_norm": 13.642202377319336, "learning_rate": 3.906049144976422e-05, "loss": 2.7606, "step": 1053000 }, { "epoch": 2.2053036138928896, "grad_norm": 17.11703109741211, "learning_rate": 3.905524719327393e-05, "loss": 2.7658, "step": 1053500 }, { "epoch": 2.206350269618515, "grad_norm": 20.26555633544922, "learning_rate": 3.9050002936783636e-05, "loss": 2.7696, "step": 1054000 }, { "epoch": 2.2073969253441406, "grad_norm": 17.857336044311523, "learning_rate": 3.904475868029334e-05, "loss": 2.7615, "step": 1054500 }, { "epoch": 2.208443581069766, "grad_norm": 12.641064643859863, "learning_rate": 3.903951442380305e-05, "loss": 2.7373, "step": 1055000 }, { "epoch": 2.2094902367953915, "grad_norm": 15.534220695495605, "learning_rate": 3.9034270167312756e-05, "loss": 2.7457, "step": 1055500 }, { "epoch": 2.210536892521017, "grad_norm": 13.493969917297363, "learning_rate": 3.902902591082247e-05, "loss": 2.76, "step": 1056000 }, { "epoch": 2.2115835482466424, "grad_norm": 15.922874450683594, "learning_rate": 3.902378165433218e-05, "loss": 2.7469, "step": 1056500 }, { "epoch": 2.212630203972268, "grad_norm": 13.536073684692383, "learning_rate": 3.901853739784189e-05, "loss": 2.7402, "step": 1057000 }, { "epoch": 2.2136768596978933, "grad_norm": 12.747513771057129, "learning_rate": 3.901329314135159e-05, "loss": 2.7682, "step": 1057500 }, { "epoch": 2.2147235154235188, "grad_norm": 14.891782760620117, "learning_rate": 3.9008048884861304e-05, "loss": 2.7308, "step": 1058000 }, { "epoch": 2.215770171149144, "grad_norm": 13.077167510986328, "learning_rate": 3.900280462837101e-05, "loss": 2.7508, "step": 1058500 }, { "epoch": 2.2168168268747697, "grad_norm": 17.25847053527832, "learning_rate": 3.899756037188072e-05, "loss": 2.7523, "step": 1059000 }, { "epoch": 2.217863482600395, "grad_norm": 12.920720100402832, "learning_rate": 3.8992316115390424e-05, "loss": 2.7687, "step": 1059500 }, { "epoch": 2.2189101383260206, "grad_norm": 13.318377494812012, "learning_rate": 3.8987071858900135e-05, "loss": 2.7532, "step": 1060000 }, { "epoch": 2.219956794051646, "grad_norm": 13.616296768188477, "learning_rate": 3.898182760240984e-05, "loss": 2.7517, "step": 1060500 }, { "epoch": 2.2210034497772715, "grad_norm": 14.827589988708496, "learning_rate": 3.897658334591955e-05, "loss": 2.7688, "step": 1061000 }, { "epoch": 2.222050105502897, "grad_norm": 21.568086624145508, "learning_rate": 3.897133908942926e-05, "loss": 2.7556, "step": 1061500 }, { "epoch": 2.223096761228523, "grad_norm": 12.597139358520508, "learning_rate": 3.8966094832938966e-05, "loss": 2.7497, "step": 1062000 }, { "epoch": 2.2241434169541483, "grad_norm": 12.208361625671387, "learning_rate": 3.896085057644868e-05, "loss": 2.7583, "step": 1062500 }, { "epoch": 2.2251900726797738, "grad_norm": 14.259383201599121, "learning_rate": 3.895560631995838e-05, "loss": 2.7524, "step": 1063000 }, { "epoch": 2.226236728405399, "grad_norm": 12.88662338256836, "learning_rate": 3.895036206346809e-05, "loss": 2.751, "step": 1063500 }, { "epoch": 2.2272833841310247, "grad_norm": 13.749262809753418, "learning_rate": 3.89451178069778e-05, "loss": 2.7721, "step": 1064000 }, { "epoch": 2.22833003985665, "grad_norm": 14.532735824584961, "learning_rate": 3.893987355048751e-05, "loss": 2.7517, "step": 1064500 }, { "epoch": 2.2293766955822756, "grad_norm": 15.050461769104004, "learning_rate": 3.893462929399722e-05, "loss": 2.7627, "step": 1065000 }, { "epoch": 2.230423351307901, "grad_norm": 17.941728591918945, "learning_rate": 3.892938503750693e-05, "loss": 2.7443, "step": 1065500 }, { "epoch": 2.2314700070335265, "grad_norm": 13.787934303283691, "learning_rate": 3.8924140781016634e-05, "loss": 2.7227, "step": 1066000 }, { "epoch": 2.232516662759152, "grad_norm": 12.009328842163086, "learning_rate": 3.891889652452634e-05, "loss": 2.7518, "step": 1066500 }, { "epoch": 2.2335633184847774, "grad_norm": 13.729120254516602, "learning_rate": 3.891365226803605e-05, "loss": 2.7146, "step": 1067000 }, { "epoch": 2.234609974210403, "grad_norm": 12.171625137329102, "learning_rate": 3.8908408011545754e-05, "loss": 2.7527, "step": 1067500 }, { "epoch": 2.2356566299360283, "grad_norm": 15.028297424316406, "learning_rate": 3.8903163755055465e-05, "loss": 2.75, "step": 1068000 }, { "epoch": 2.2367032856616538, "grad_norm": 13.189164161682129, "learning_rate": 3.889791949856517e-05, "loss": 2.7646, "step": 1068500 }, { "epoch": 2.2377499413872792, "grad_norm": 12.829035758972168, "learning_rate": 3.889267524207488e-05, "loss": 2.7409, "step": 1069000 }, { "epoch": 2.2387965971129047, "grad_norm": 14.08764362335205, "learning_rate": 3.888743098558459e-05, "loss": 2.7447, "step": 1069500 }, { "epoch": 2.23984325283853, "grad_norm": 12.208657264709473, "learning_rate": 3.88821867290943e-05, "loss": 2.759, "step": 1070000 }, { "epoch": 2.2408899085641556, "grad_norm": 12.280634880065918, "learning_rate": 3.887694247260401e-05, "loss": 2.7447, "step": 1070500 }, { "epoch": 2.2419365642897815, "grad_norm": 11.635970115661621, "learning_rate": 3.887169821611372e-05, "loss": 2.7456, "step": 1071000 }, { "epoch": 2.242983220015407, "grad_norm": 11.628758430480957, "learning_rate": 3.886645395962342e-05, "loss": 2.7565, "step": 1071500 }, { "epoch": 2.2440298757410324, "grad_norm": 12.790730476379395, "learning_rate": 3.8861209703133127e-05, "loss": 2.7472, "step": 1072000 }, { "epoch": 2.245076531466658, "grad_norm": 12.901609420776367, "learning_rate": 3.885596544664284e-05, "loss": 2.7393, "step": 1072500 }, { "epoch": 2.2461231871922833, "grad_norm": 14.880908966064453, "learning_rate": 3.885072119015254e-05, "loss": 2.7393, "step": 1073000 }, { "epoch": 2.2471698429179088, "grad_norm": 13.23674488067627, "learning_rate": 3.884547693366225e-05, "loss": 2.7496, "step": 1073500 }, { "epoch": 2.2482164986435342, "grad_norm": 15.37663459777832, "learning_rate": 3.8840232677171964e-05, "loss": 2.755, "step": 1074000 }, { "epoch": 2.2492631543691597, "grad_norm": 15.731942176818848, "learning_rate": 3.8834988420681675e-05, "loss": 2.7361, "step": 1074500 }, { "epoch": 2.250309810094785, "grad_norm": 15.65284538269043, "learning_rate": 3.882974416419138e-05, "loss": 2.7472, "step": 1075000 }, { "epoch": 2.2513564658204106, "grad_norm": 11.456273078918457, "learning_rate": 3.882449990770109e-05, "loss": 2.762, "step": 1075500 }, { "epoch": 2.252403121546036, "grad_norm": 12.968689918518066, "learning_rate": 3.8819255651210795e-05, "loss": 2.7548, "step": 1076000 }, { "epoch": 2.2534497772716615, "grad_norm": 13.303078651428223, "learning_rate": 3.8814011394720506e-05, "loss": 2.7459, "step": 1076500 }, { "epoch": 2.254496432997287, "grad_norm": 14.558762550354004, "learning_rate": 3.880876713823021e-05, "loss": 2.7437, "step": 1077000 }, { "epoch": 2.2555430887229124, "grad_norm": 19.429729461669922, "learning_rate": 3.8803522881739914e-05, "loss": 2.7553, "step": 1077500 }, { "epoch": 2.256589744448538, "grad_norm": 14.164487838745117, "learning_rate": 3.8798278625249626e-05, "loss": 2.7415, "step": 1078000 }, { "epoch": 2.2576364001741633, "grad_norm": 12.570723533630371, "learning_rate": 3.879303436875934e-05, "loss": 2.726, "step": 1078500 }, { "epoch": 2.2586830558997892, "grad_norm": 13.834267616271973, "learning_rate": 3.878779011226905e-05, "loss": 2.7313, "step": 1079000 }, { "epoch": 2.2597297116254147, "grad_norm": 12.613884925842285, "learning_rate": 3.878254585577875e-05, "loss": 2.7396, "step": 1079500 }, { "epoch": 2.26077636735104, "grad_norm": 16.546222686767578, "learning_rate": 3.877730159928846e-05, "loss": 2.7612, "step": 1080000 }, { "epoch": 2.2618230230766656, "grad_norm": 14.192590713500977, "learning_rate": 3.877205734279817e-05, "loss": 2.7362, "step": 1080500 }, { "epoch": 2.262869678802291, "grad_norm": 13.864179611206055, "learning_rate": 3.876681308630788e-05, "loss": 2.7331, "step": 1081000 }, { "epoch": 2.2639163345279165, "grad_norm": 13.366903305053711, "learning_rate": 3.876156882981758e-05, "loss": 2.7331, "step": 1081500 }, { "epoch": 2.264962990253542, "grad_norm": 17.478424072265625, "learning_rate": 3.8756324573327294e-05, "loss": 2.752, "step": 1082000 }, { "epoch": 2.2660096459791674, "grad_norm": 12.220684051513672, "learning_rate": 3.8751080316837005e-05, "loss": 2.7551, "step": 1082500 }, { "epoch": 2.267056301704793, "grad_norm": 13.524785041809082, "learning_rate": 3.874583606034671e-05, "loss": 2.7535, "step": 1083000 }, { "epoch": 2.2681029574304183, "grad_norm": 13.156634330749512, "learning_rate": 3.874059180385642e-05, "loss": 2.7405, "step": 1083500 }, { "epoch": 2.269149613156044, "grad_norm": 15.005990028381348, "learning_rate": 3.8735347547366125e-05, "loss": 2.7459, "step": 1084000 }, { "epoch": 2.2701962688816693, "grad_norm": 14.163520812988281, "learning_rate": 3.8730103290875836e-05, "loss": 2.7477, "step": 1084500 }, { "epoch": 2.2712429246072947, "grad_norm": 13.066126823425293, "learning_rate": 3.872485903438554e-05, "loss": 2.7555, "step": 1085000 }, { "epoch": 2.27228958033292, "grad_norm": 17.747356414794922, "learning_rate": 3.871961477789525e-05, "loss": 2.7639, "step": 1085500 }, { "epoch": 2.2733362360585456, "grad_norm": 12.309369087219238, "learning_rate": 3.8714370521404955e-05, "loss": 2.7376, "step": 1086000 }, { "epoch": 2.274382891784171, "grad_norm": 22.801292419433594, "learning_rate": 3.8709126264914666e-05, "loss": 2.7558, "step": 1086500 }, { "epoch": 2.2754295475097965, "grad_norm": 14.729948997497559, "learning_rate": 3.870388200842438e-05, "loss": 2.7419, "step": 1087000 }, { "epoch": 2.276476203235422, "grad_norm": 13.112455368041992, "learning_rate": 3.869863775193409e-05, "loss": 2.7654, "step": 1087500 }, { "epoch": 2.2775228589610474, "grad_norm": 12.218644142150879, "learning_rate": 3.869339349544379e-05, "loss": 2.7476, "step": 1088000 }, { "epoch": 2.278569514686673, "grad_norm": 13.460307121276855, "learning_rate": 3.8688149238953504e-05, "loss": 2.735, "step": 1088500 }, { "epoch": 2.279616170412299, "grad_norm": 20.68172836303711, "learning_rate": 3.868290498246321e-05, "loss": 2.7174, "step": 1089000 }, { "epoch": 2.2806628261379243, "grad_norm": 14.887578010559082, "learning_rate": 3.867766072597291e-05, "loss": 2.7371, "step": 1089500 }, { "epoch": 2.2817094818635497, "grad_norm": 13.423616409301758, "learning_rate": 3.8672416469482624e-05, "loss": 2.7355, "step": 1090000 }, { "epoch": 2.282756137589175, "grad_norm": 16.067533493041992, "learning_rate": 3.866717221299233e-05, "loss": 2.7395, "step": 1090500 }, { "epoch": 2.2838027933148006, "grad_norm": 14.982264518737793, "learning_rate": 3.866192795650204e-05, "loss": 2.733, "step": 1091000 }, { "epoch": 2.284849449040426, "grad_norm": 13.10973072052002, "learning_rate": 3.865668370001175e-05, "loss": 2.736, "step": 1091500 }, { "epoch": 2.2858961047660515, "grad_norm": 20.353116989135742, "learning_rate": 3.865143944352146e-05, "loss": 2.7432, "step": 1092000 }, { "epoch": 2.286942760491677, "grad_norm": 13.08413028717041, "learning_rate": 3.8646195187031166e-05, "loss": 2.7357, "step": 1092500 }, { "epoch": 2.2879894162173025, "grad_norm": 10.873072624206543, "learning_rate": 3.864095093054088e-05, "loss": 2.7442, "step": 1093000 }, { "epoch": 2.289036071942928, "grad_norm": 15.591022491455078, "learning_rate": 3.863570667405058e-05, "loss": 2.7477, "step": 1093500 }, { "epoch": 2.2900827276685534, "grad_norm": 14.067252159118652, "learning_rate": 3.863046241756029e-05, "loss": 2.7207, "step": 1094000 }, { "epoch": 2.291129383394179, "grad_norm": 16.22450828552246, "learning_rate": 3.8625218161069996e-05, "loss": 2.7468, "step": 1094500 }, { "epoch": 2.2921760391198043, "grad_norm": 11.786515235900879, "learning_rate": 3.86199739045797e-05, "loss": 2.7422, "step": 1095000 }, { "epoch": 2.2932226948454297, "grad_norm": 13.14480972290039, "learning_rate": 3.861472964808941e-05, "loss": 2.7336, "step": 1095500 }, { "epoch": 2.294269350571055, "grad_norm": 13.058051109313965, "learning_rate": 3.860948539159912e-05, "loss": 2.7318, "step": 1096000 }, { "epoch": 2.295316006296681, "grad_norm": 11.890332221984863, "learning_rate": 3.8604241135108834e-05, "loss": 2.7406, "step": 1096500 }, { "epoch": 2.2963626620223065, "grad_norm": 12.377656936645508, "learning_rate": 3.859899687861854e-05, "loss": 2.7407, "step": 1097000 }, { "epoch": 2.297409317747932, "grad_norm": 12.993693351745605, "learning_rate": 3.859375262212825e-05, "loss": 2.7244, "step": 1097500 }, { "epoch": 2.2984559734735575, "grad_norm": 13.692402839660645, "learning_rate": 3.8588508365637954e-05, "loss": 2.7554, "step": 1098000 }, { "epoch": 2.299502629199183, "grad_norm": 11.749621391296387, "learning_rate": 3.8583264109147665e-05, "loss": 2.7409, "step": 1098500 }, { "epoch": 2.3005492849248084, "grad_norm": 12.368427276611328, "learning_rate": 3.857801985265737e-05, "loss": 2.735, "step": 1099000 }, { "epoch": 2.301595940650434, "grad_norm": 13.62158203125, "learning_rate": 3.857277559616708e-05, "loss": 2.7124, "step": 1099500 }, { "epoch": 2.3026425963760593, "grad_norm": 13.4213285446167, "learning_rate": 3.856753133967679e-05, "loss": 2.7418, "step": 1100000 }, { "epoch": 2.3036892521016847, "grad_norm": 13.483366012573242, "learning_rate": 3.8562287083186495e-05, "loss": 2.7484, "step": 1100500 }, { "epoch": 2.30473590782731, "grad_norm": 14.803101539611816, "learning_rate": 3.8557042826696206e-05, "loss": 2.7334, "step": 1101000 }, { "epoch": 2.3057825635529356, "grad_norm": 12.153907775878906, "learning_rate": 3.855179857020591e-05, "loss": 2.7307, "step": 1101500 }, { "epoch": 2.306829219278561, "grad_norm": 11.179826736450195, "learning_rate": 3.854655431371562e-05, "loss": 2.7511, "step": 1102000 }, { "epoch": 2.3078758750041866, "grad_norm": 12.672547340393066, "learning_rate": 3.8541310057225326e-05, "loss": 2.7529, "step": 1102500 }, { "epoch": 2.308922530729812, "grad_norm": 13.020709037780762, "learning_rate": 3.853606580073504e-05, "loss": 2.7251, "step": 1103000 }, { "epoch": 2.3099691864554375, "grad_norm": 12.958050727844238, "learning_rate": 3.853082154424474e-05, "loss": 2.7343, "step": 1103500 }, { "epoch": 2.311015842181063, "grad_norm": 12.83717155456543, "learning_rate": 3.852557728775445e-05, "loss": 2.7313, "step": 1104000 }, { "epoch": 2.3120624979066884, "grad_norm": 15.127089500427246, "learning_rate": 3.8520333031264164e-05, "loss": 2.7466, "step": 1104500 }, { "epoch": 2.313109153632314, "grad_norm": 13.332708358764648, "learning_rate": 3.8515088774773875e-05, "loss": 2.7224, "step": 1105000 }, { "epoch": 2.3141558093579393, "grad_norm": 13.272042274475098, "learning_rate": 3.850984451828358e-05, "loss": 2.7292, "step": 1105500 }, { "epoch": 2.3152024650835648, "grad_norm": 11.951262474060059, "learning_rate": 3.850460026179328e-05, "loss": 2.7298, "step": 1106000 }, { "epoch": 2.3162491208091907, "grad_norm": 12.080782890319824, "learning_rate": 3.8499356005302994e-05, "loss": 2.735, "step": 1106500 }, { "epoch": 2.317295776534816, "grad_norm": 17.652856826782227, "learning_rate": 3.84941117488127e-05, "loss": 2.7181, "step": 1107000 }, { "epoch": 2.3183424322604416, "grad_norm": 13.926711082458496, "learning_rate": 3.848886749232241e-05, "loss": 2.734, "step": 1107500 }, { "epoch": 2.319389087986067, "grad_norm": 12.091828346252441, "learning_rate": 3.8483623235832114e-05, "loss": 2.7357, "step": 1108000 }, { "epoch": 2.3204357437116925, "grad_norm": 12.62860107421875, "learning_rate": 3.8478378979341825e-05, "loss": 2.7304, "step": 1108500 }, { "epoch": 2.321482399437318, "grad_norm": 15.59883975982666, "learning_rate": 3.8473134722851536e-05, "loss": 2.7248, "step": 1109000 }, { "epoch": 2.3225290551629434, "grad_norm": 12.351410865783691, "learning_rate": 3.846789046636125e-05, "loss": 2.7347, "step": 1109500 }, { "epoch": 2.323575710888569, "grad_norm": 12.903670310974121, "learning_rate": 3.846264620987095e-05, "loss": 2.7355, "step": 1110000 }, { "epoch": 2.3246223666141943, "grad_norm": 13.00476360321045, "learning_rate": 3.845740195338066e-05, "loss": 2.7295, "step": 1110500 }, { "epoch": 2.3256690223398198, "grad_norm": 13.23592758178711, "learning_rate": 3.845215769689037e-05, "loss": 2.743, "step": 1111000 }, { "epoch": 2.326715678065445, "grad_norm": 11.601378440856934, "learning_rate": 3.844691344040007e-05, "loss": 2.7411, "step": 1111500 }, { "epoch": 2.3277623337910707, "grad_norm": 11.092071533203125, "learning_rate": 3.844166918390978e-05, "loss": 2.7217, "step": 1112000 }, { "epoch": 2.328808989516696, "grad_norm": 13.533218383789062, "learning_rate": 3.843642492741949e-05, "loss": 2.7374, "step": 1112500 }, { "epoch": 2.3298556452423216, "grad_norm": 12.104419708251953, "learning_rate": 3.84311806709292e-05, "loss": 2.7154, "step": 1113000 }, { "epoch": 2.330902300967947, "grad_norm": 11.233989715576172, "learning_rate": 3.842593641443891e-05, "loss": 2.7449, "step": 1113500 }, { "epoch": 2.331948956693573, "grad_norm": 12.665777206420898, "learning_rate": 3.842069215794862e-05, "loss": 2.7455, "step": 1114000 }, { "epoch": 2.3329956124191984, "grad_norm": 13.22152328491211, "learning_rate": 3.8415447901458324e-05, "loss": 2.7423, "step": 1114500 }, { "epoch": 2.334042268144824, "grad_norm": 13.139724731445312, "learning_rate": 3.8410203644968035e-05, "loss": 2.7492, "step": 1115000 }, { "epoch": 2.3350889238704493, "grad_norm": 13.09311580657959, "learning_rate": 3.840495938847774e-05, "loss": 2.7166, "step": 1115500 }, { "epoch": 2.3361355795960748, "grad_norm": 15.967026710510254, "learning_rate": 3.839971513198745e-05, "loss": 2.7103, "step": 1116000 }, { "epoch": 2.3371822353217, "grad_norm": 38.33979415893555, "learning_rate": 3.8394470875497155e-05, "loss": 2.7389, "step": 1116500 }, { "epoch": 2.3382288910473257, "grad_norm": 13.629314422607422, "learning_rate": 3.8389226619006866e-05, "loss": 2.7381, "step": 1117000 }, { "epoch": 2.339275546772951, "grad_norm": 13.243985176086426, "learning_rate": 3.838398236251658e-05, "loss": 2.714, "step": 1117500 }, { "epoch": 2.3403222024985766, "grad_norm": 14.882904052734375, "learning_rate": 3.837873810602628e-05, "loss": 2.7108, "step": 1118000 }, { "epoch": 2.341368858224202, "grad_norm": 13.399284362792969, "learning_rate": 3.837349384953599e-05, "loss": 2.7482, "step": 1118500 }, { "epoch": 2.3424155139498275, "grad_norm": 14.221741676330566, "learning_rate": 3.83682495930457e-05, "loss": 2.7345, "step": 1119000 }, { "epoch": 2.343462169675453, "grad_norm": 13.465471267700195, "learning_rate": 3.836300533655541e-05, "loss": 2.7434, "step": 1119500 }, { "epoch": 2.3445088254010784, "grad_norm": 11.586612701416016, "learning_rate": 3.835776108006511e-05, "loss": 2.7137, "step": 1120000 }, { "epoch": 2.345555481126704, "grad_norm": 12.145007133483887, "learning_rate": 3.835251682357482e-05, "loss": 2.7317, "step": 1120500 }, { "epoch": 2.3466021368523293, "grad_norm": 14.292471885681152, "learning_rate": 3.834727256708453e-05, "loss": 2.7275, "step": 1121000 }, { "epoch": 2.347648792577955, "grad_norm": 13.162945747375488, "learning_rate": 3.834202831059424e-05, "loss": 2.7532, "step": 1121500 }, { "epoch": 2.3486954483035802, "grad_norm": 11.76492691040039, "learning_rate": 3.833678405410395e-05, "loss": 2.7348, "step": 1122000 }, { "epoch": 2.3497421040292057, "grad_norm": 16.426212310791016, "learning_rate": 3.833153979761366e-05, "loss": 2.727, "step": 1122500 }, { "epoch": 2.350788759754831, "grad_norm": 13.454987525939941, "learning_rate": 3.8326295541123365e-05, "loss": 2.7231, "step": 1123000 }, { "epoch": 2.3518354154804566, "grad_norm": 16.29376220703125, "learning_rate": 3.832105128463307e-05, "loss": 2.7189, "step": 1123500 }, { "epoch": 2.3528820712060825, "grad_norm": 12.4351806640625, "learning_rate": 3.831580702814278e-05, "loss": 2.7447, "step": 1124000 }, { "epoch": 2.353928726931708, "grad_norm": 12.135993957519531, "learning_rate": 3.8310562771652485e-05, "loss": 2.7214, "step": 1124500 }, { "epoch": 2.3549753826573334, "grad_norm": 11.651653289794922, "learning_rate": 3.8305318515162196e-05, "loss": 2.7144, "step": 1125000 }, { "epoch": 2.356022038382959, "grad_norm": 13.431503295898438, "learning_rate": 3.83000742586719e-05, "loss": 2.7202, "step": 1125500 }, { "epoch": 2.3570686941085843, "grad_norm": 13.282185554504395, "learning_rate": 3.829483000218161e-05, "loss": 2.7387, "step": 1126000 }, { "epoch": 2.35811534983421, "grad_norm": 12.127735137939453, "learning_rate": 3.828958574569132e-05, "loss": 2.7199, "step": 1126500 }, { "epoch": 2.3591620055598352, "grad_norm": 15.265116691589355, "learning_rate": 3.8284341489201034e-05, "loss": 2.7139, "step": 1127000 }, { "epoch": 2.3602086612854607, "grad_norm": 13.664671897888184, "learning_rate": 3.827909723271074e-05, "loss": 2.7629, "step": 1127500 }, { "epoch": 2.361255317011086, "grad_norm": 12.986162185668945, "learning_rate": 3.827385297622045e-05, "loss": 2.728, "step": 1128000 }, { "epoch": 2.3623019727367116, "grad_norm": 14.109620094299316, "learning_rate": 3.826860871973015e-05, "loss": 2.7019, "step": 1128500 }, { "epoch": 2.363348628462337, "grad_norm": 14.408512115478516, "learning_rate": 3.826336446323986e-05, "loss": 2.7362, "step": 1129000 }, { "epoch": 2.3643952841879625, "grad_norm": 14.552087783813477, "learning_rate": 3.825812020674957e-05, "loss": 2.7324, "step": 1129500 }, { "epoch": 2.365441939913588, "grad_norm": 14.164555549621582, "learning_rate": 3.825287595025927e-05, "loss": 2.7212, "step": 1130000 }, { "epoch": 2.3664885956392134, "grad_norm": 13.732348442077637, "learning_rate": 3.8247631693768984e-05, "loss": 2.7271, "step": 1130500 }, { "epoch": 2.367535251364839, "grad_norm": 11.334239959716797, "learning_rate": 3.8242387437278695e-05, "loss": 2.7235, "step": 1131000 }, { "epoch": 2.368581907090465, "grad_norm": 11.131763458251953, "learning_rate": 3.8237143180788406e-05, "loss": 2.7235, "step": 1131500 }, { "epoch": 2.3696285628160902, "grad_norm": 13.976936340332031, "learning_rate": 3.823189892429811e-05, "loss": 2.7261, "step": 1132000 }, { "epoch": 2.3706752185417157, "grad_norm": 16.449399948120117, "learning_rate": 3.822665466780782e-05, "loss": 2.7283, "step": 1132500 }, { "epoch": 2.371721874267341, "grad_norm": 12.83079719543457, "learning_rate": 3.8221410411317526e-05, "loss": 2.7234, "step": 1133000 }, { "epoch": 2.3727685299929666, "grad_norm": 10.438515663146973, "learning_rate": 3.821616615482724e-05, "loss": 2.7263, "step": 1133500 }, { "epoch": 2.373815185718592, "grad_norm": 15.485806465148926, "learning_rate": 3.821092189833694e-05, "loss": 2.7221, "step": 1134000 }, { "epoch": 2.3748618414442175, "grad_norm": 13.848986625671387, "learning_rate": 3.8205677641846646e-05, "loss": 2.7284, "step": 1134500 }, { "epoch": 2.375908497169843, "grad_norm": 21.166187286376953, "learning_rate": 3.820043338535636e-05, "loss": 2.7199, "step": 1135000 }, { "epoch": 2.3769551528954684, "grad_norm": 19.177453994750977, "learning_rate": 3.819518912886607e-05, "loss": 2.7324, "step": 1135500 }, { "epoch": 2.378001808621094, "grad_norm": 12.854473114013672, "learning_rate": 3.818994487237578e-05, "loss": 2.7134, "step": 1136000 }, { "epoch": 2.3790484643467193, "grad_norm": 13.962855339050293, "learning_rate": 3.818470061588548e-05, "loss": 2.7201, "step": 1136500 }, { "epoch": 2.380095120072345, "grad_norm": 13.707995414733887, "learning_rate": 3.8179456359395194e-05, "loss": 2.7168, "step": 1137000 }, { "epoch": 2.3811417757979703, "grad_norm": 11.975110054016113, "learning_rate": 3.81742121029049e-05, "loss": 2.7413, "step": 1137500 }, { "epoch": 2.3821884315235957, "grad_norm": 14.611165046691895, "learning_rate": 3.816896784641461e-05, "loss": 2.7337, "step": 1138000 }, { "epoch": 2.383235087249221, "grad_norm": 16.378814697265625, "learning_rate": 3.8163723589924314e-05, "loss": 2.7287, "step": 1138500 }, { "epoch": 2.3842817429748466, "grad_norm": 13.787980079650879, "learning_rate": 3.8158479333434025e-05, "loss": 2.7196, "step": 1139000 }, { "epoch": 2.385328398700472, "grad_norm": 11.752943992614746, "learning_rate": 3.8153235076943736e-05, "loss": 2.7259, "step": 1139500 }, { "epoch": 2.3863750544260975, "grad_norm": 12.514482498168945, "learning_rate": 3.814799082045344e-05, "loss": 2.7129, "step": 1140000 }, { "epoch": 2.387421710151723, "grad_norm": 12.694448471069336, "learning_rate": 3.814274656396315e-05, "loss": 2.6997, "step": 1140500 }, { "epoch": 2.3884683658773485, "grad_norm": 16.36587905883789, "learning_rate": 3.8137502307472856e-05, "loss": 2.747, "step": 1141000 }, { "epoch": 2.3895150216029744, "grad_norm": 11.816117286682129, "learning_rate": 3.813225805098257e-05, "loss": 2.6926, "step": 1141500 }, { "epoch": 2.3905616773286, "grad_norm": 12.546745300292969, "learning_rate": 3.812701379449227e-05, "loss": 2.7112, "step": 1142000 }, { "epoch": 2.3916083330542253, "grad_norm": 13.027100563049316, "learning_rate": 3.812176953800198e-05, "loss": 2.7333, "step": 1142500 }, { "epoch": 2.3926549887798507, "grad_norm": 12.56962776184082, "learning_rate": 3.8116525281511686e-05, "loss": 2.7211, "step": 1143000 }, { "epoch": 2.393701644505476, "grad_norm": 14.452160835266113, "learning_rate": 3.81112810250214e-05, "loss": 2.7402, "step": 1143500 }, { "epoch": 2.3947483002311016, "grad_norm": 12.017755508422852, "learning_rate": 3.810603676853111e-05, "loss": 2.7184, "step": 1144000 }, { "epoch": 2.395794955956727, "grad_norm": 15.206551551818848, "learning_rate": 3.810079251204082e-05, "loss": 2.7247, "step": 1144500 }, { "epoch": 2.3968416116823525, "grad_norm": 13.501924514770508, "learning_rate": 3.8095548255550524e-05, "loss": 2.7209, "step": 1145000 }, { "epoch": 2.397888267407978, "grad_norm": 13.004738807678223, "learning_rate": 3.809030399906023e-05, "loss": 2.7152, "step": 1145500 }, { "epoch": 2.3989349231336035, "grad_norm": 12.930014610290527, "learning_rate": 3.808505974256994e-05, "loss": 2.7196, "step": 1146000 }, { "epoch": 2.399981578859229, "grad_norm": 11.935460090637207, "learning_rate": 3.8079815486079644e-05, "loss": 2.7186, "step": 1146500 }, { "epoch": 2.4010282345848544, "grad_norm": 15.254995346069336, "learning_rate": 3.8074571229589355e-05, "loss": 2.7276, "step": 1147000 }, { "epoch": 2.40207489031048, "grad_norm": 13.4544095993042, "learning_rate": 3.806932697309906e-05, "loss": 2.7251, "step": 1147500 }, { "epoch": 2.4031215460361053, "grad_norm": 16.707366943359375, "learning_rate": 3.806408271660877e-05, "loss": 2.7343, "step": 1148000 }, { "epoch": 2.4041682017617307, "grad_norm": 12.306478500366211, "learning_rate": 3.805883846011848e-05, "loss": 2.7161, "step": 1148500 }, { "epoch": 2.4052148574873566, "grad_norm": 13.230084419250488, "learning_rate": 3.805359420362819e-05, "loss": 2.7325, "step": 1149000 }, { "epoch": 2.406261513212982, "grad_norm": 15.955851554870605, "learning_rate": 3.8048349947137897e-05, "loss": 2.7169, "step": 1149500 }, { "epoch": 2.4073081689386076, "grad_norm": 12.621260643005371, "learning_rate": 3.804310569064761e-05, "loss": 2.7221, "step": 1150000 }, { "epoch": 2.408354824664233, "grad_norm": 12.709228515625, "learning_rate": 3.803786143415731e-05, "loss": 2.7315, "step": 1150500 }, { "epoch": 2.4094014803898585, "grad_norm": 12.051759719848633, "learning_rate": 3.8032617177667016e-05, "loss": 2.7211, "step": 1151000 }, { "epoch": 2.410448136115484, "grad_norm": 11.777532577514648, "learning_rate": 3.802737292117673e-05, "loss": 2.714, "step": 1151500 }, { "epoch": 2.4114947918411094, "grad_norm": 15.562692642211914, "learning_rate": 3.802212866468643e-05, "loss": 2.7148, "step": 1152000 }, { "epoch": 2.412541447566735, "grad_norm": 14.368795394897461, "learning_rate": 3.801688440819615e-05, "loss": 2.7067, "step": 1152500 }, { "epoch": 2.4135881032923603, "grad_norm": 13.223861694335938, "learning_rate": 3.8011640151705854e-05, "loss": 2.7339, "step": 1153000 }, { "epoch": 2.4146347590179857, "grad_norm": 40.77092361450195, "learning_rate": 3.8006395895215565e-05, "loss": 2.7026, "step": 1153500 }, { "epoch": 2.415681414743611, "grad_norm": 14.505212783813477, "learning_rate": 3.800115163872527e-05, "loss": 2.7278, "step": 1154000 }, { "epoch": 2.4167280704692367, "grad_norm": 11.626049041748047, "learning_rate": 3.799590738223498e-05, "loss": 2.725, "step": 1154500 }, { "epoch": 2.417774726194862, "grad_norm": 15.1699800491333, "learning_rate": 3.7990663125744685e-05, "loss": 2.7168, "step": 1155000 }, { "epoch": 2.4188213819204876, "grad_norm": 12.37629222869873, "learning_rate": 3.7985418869254396e-05, "loss": 2.721, "step": 1155500 }, { "epoch": 2.419868037646113, "grad_norm": 13.065803527832031, "learning_rate": 3.79801746127641e-05, "loss": 2.7242, "step": 1156000 }, { "epoch": 2.4209146933717385, "grad_norm": 11.265315055847168, "learning_rate": 3.797493035627381e-05, "loss": 2.7047, "step": 1156500 }, { "epoch": 2.421961349097364, "grad_norm": 12.480923652648926, "learning_rate": 3.796968609978352e-05, "loss": 2.7155, "step": 1157000 }, { "epoch": 2.4230080048229894, "grad_norm": 12.93478012084961, "learning_rate": 3.7964441843293226e-05, "loss": 2.7127, "step": 1157500 }, { "epoch": 2.424054660548615, "grad_norm": 13.096055030822754, "learning_rate": 3.795919758680294e-05, "loss": 2.6942, "step": 1158000 }, { "epoch": 2.4251013162742403, "grad_norm": 11.882911682128906, "learning_rate": 3.795395333031264e-05, "loss": 2.7094, "step": 1158500 }, { "epoch": 2.426147971999866, "grad_norm": 13.208986282348633, "learning_rate": 3.794870907382235e-05, "loss": 2.6975, "step": 1159000 }, { "epoch": 2.4271946277254917, "grad_norm": 11.927470207214355, "learning_rate": 3.794346481733206e-05, "loss": 2.7025, "step": 1159500 }, { "epoch": 2.428241283451117, "grad_norm": 12.808313369750977, "learning_rate": 3.793822056084177e-05, "loss": 2.7132, "step": 1160000 }, { "epoch": 2.4292879391767426, "grad_norm": 13.844876289367676, "learning_rate": 3.793297630435147e-05, "loss": 2.7497, "step": 1160500 }, { "epoch": 2.430334594902368, "grad_norm": 13.059405326843262, "learning_rate": 3.7927732047861184e-05, "loss": 2.7075, "step": 1161000 }, { "epoch": 2.4313812506279935, "grad_norm": 13.159467697143555, "learning_rate": 3.7922487791370895e-05, "loss": 2.718, "step": 1161500 }, { "epoch": 2.432427906353619, "grad_norm": 13.837850570678711, "learning_rate": 3.7917243534880606e-05, "loss": 2.7015, "step": 1162000 }, { "epoch": 2.4334745620792444, "grad_norm": 13.507567405700684, "learning_rate": 3.791199927839031e-05, "loss": 2.7243, "step": 1162500 }, { "epoch": 2.43452121780487, "grad_norm": 18.157470703125, "learning_rate": 3.7906755021900014e-05, "loss": 2.7116, "step": 1163000 }, { "epoch": 2.4355678735304953, "grad_norm": 14.105997085571289, "learning_rate": 3.7901510765409725e-05, "loss": 2.7084, "step": 1163500 }, { "epoch": 2.4366145292561208, "grad_norm": 11.942743301391602, "learning_rate": 3.789626650891943e-05, "loss": 2.7134, "step": 1164000 }, { "epoch": 2.437661184981746, "grad_norm": 13.767003059387207, "learning_rate": 3.789102225242914e-05, "loss": 2.7269, "step": 1164500 }, { "epoch": 2.4387078407073717, "grad_norm": 13.3131742477417, "learning_rate": 3.7885777995938845e-05, "loss": 2.7293, "step": 1165000 }, { "epoch": 2.439754496432997, "grad_norm": 13.12741756439209, "learning_rate": 3.7880533739448556e-05, "loss": 2.7242, "step": 1165500 }, { "epoch": 2.4408011521586226, "grad_norm": 12.88577938079834, "learning_rate": 3.787528948295827e-05, "loss": 2.695, "step": 1166000 }, { "epoch": 2.4418478078842485, "grad_norm": 12.207438468933105, "learning_rate": 3.787004522646798e-05, "loss": 2.7227, "step": 1166500 }, { "epoch": 2.442894463609874, "grad_norm": 15.723990440368652, "learning_rate": 3.786480096997768e-05, "loss": 2.7311, "step": 1167000 }, { "epoch": 2.4439411193354994, "grad_norm": 15.022917747497559, "learning_rate": 3.7859556713487394e-05, "loss": 2.729, "step": 1167500 }, { "epoch": 2.444987775061125, "grad_norm": 13.419880867004395, "learning_rate": 3.78543124569971e-05, "loss": 2.6927, "step": 1168000 }, { "epoch": 2.4460344307867503, "grad_norm": 13.29450798034668, "learning_rate": 3.78490682005068e-05, "loss": 2.7095, "step": 1168500 }, { "epoch": 2.4470810865123758, "grad_norm": 13.807455062866211, "learning_rate": 3.7843823944016513e-05, "loss": 2.7173, "step": 1169000 }, { "epoch": 2.4481277422380012, "grad_norm": 13.997488021850586, "learning_rate": 3.783857968752622e-05, "loss": 2.7163, "step": 1169500 }, { "epoch": 2.4491743979636267, "grad_norm": 12.905126571655273, "learning_rate": 3.7833335431035936e-05, "loss": 2.7203, "step": 1170000 }, { "epoch": 2.450221053689252, "grad_norm": 16.119474411010742, "learning_rate": 3.782809117454564e-05, "loss": 2.7085, "step": 1170500 }, { "epoch": 2.4512677094148776, "grad_norm": 12.756178855895996, "learning_rate": 3.782284691805535e-05, "loss": 2.7198, "step": 1171000 }, { "epoch": 2.452314365140503, "grad_norm": 12.167046546936035, "learning_rate": 3.7817602661565055e-05, "loss": 2.6994, "step": 1171500 }, { "epoch": 2.4533610208661285, "grad_norm": 12.921534538269043, "learning_rate": 3.7812358405074766e-05, "loss": 2.6983, "step": 1172000 }, { "epoch": 2.454407676591754, "grad_norm": 15.124780654907227, "learning_rate": 3.780711414858447e-05, "loss": 2.7153, "step": 1172500 }, { "epoch": 2.4554543323173794, "grad_norm": 13.8104829788208, "learning_rate": 3.780186989209418e-05, "loss": 2.7094, "step": 1173000 }, { "epoch": 2.456500988043005, "grad_norm": 13.526565551757812, "learning_rate": 3.7796625635603886e-05, "loss": 2.7178, "step": 1173500 }, { "epoch": 2.4575476437686303, "grad_norm": 14.23855209350586, "learning_rate": 3.77913813791136e-05, "loss": 2.7169, "step": 1174000 }, { "epoch": 2.458594299494256, "grad_norm": 12.185726165771484, "learning_rate": 3.778613712262331e-05, "loss": 2.7169, "step": 1174500 }, { "epoch": 2.4596409552198812, "grad_norm": 12.709281921386719, "learning_rate": 3.778089286613301e-05, "loss": 2.6971, "step": 1175000 }, { "epoch": 2.4606876109455067, "grad_norm": 12.016091346740723, "learning_rate": 3.7775648609642724e-05, "loss": 2.7131, "step": 1175500 }, { "epoch": 2.461734266671132, "grad_norm": 13.307774543762207, "learning_rate": 3.777040435315243e-05, "loss": 2.7265, "step": 1176000 }, { "epoch": 2.462780922396758, "grad_norm": 13.062399864196777, "learning_rate": 3.776516009666214e-05, "loss": 2.703, "step": 1176500 }, { "epoch": 2.4638275781223835, "grad_norm": 12.39004898071289, "learning_rate": 3.775991584017184e-05, "loss": 2.7188, "step": 1177000 }, { "epoch": 2.464874233848009, "grad_norm": 13.354944229125977, "learning_rate": 3.7754671583681554e-05, "loss": 2.7142, "step": 1177500 }, { "epoch": 2.4659208895736344, "grad_norm": 13.990607261657715, "learning_rate": 3.774942732719126e-05, "loss": 2.7167, "step": 1178000 }, { "epoch": 2.46696754529926, "grad_norm": 13.795412063598633, "learning_rate": 3.774418307070097e-05, "loss": 2.6967, "step": 1178500 }, { "epoch": 2.4680142010248853, "grad_norm": 12.655921936035156, "learning_rate": 3.773893881421068e-05, "loss": 2.7076, "step": 1179000 }, { "epoch": 2.469060856750511, "grad_norm": 13.44651985168457, "learning_rate": 3.7733694557720385e-05, "loss": 2.6969, "step": 1179500 }, { "epoch": 2.4701075124761362, "grad_norm": 13.336355209350586, "learning_rate": 3.7728450301230096e-05, "loss": 2.7077, "step": 1180000 }, { "epoch": 2.4711541682017617, "grad_norm": 13.633807182312012, "learning_rate": 3.77232060447398e-05, "loss": 2.7169, "step": 1180500 }, { "epoch": 2.472200823927387, "grad_norm": 12.474335670471191, "learning_rate": 3.771796178824951e-05, "loss": 2.7071, "step": 1181000 }, { "epoch": 2.4732474796530126, "grad_norm": 12.935812950134277, "learning_rate": 3.7712717531759216e-05, "loss": 2.7122, "step": 1181500 }, { "epoch": 2.474294135378638, "grad_norm": 14.973222732543945, "learning_rate": 3.770747327526893e-05, "loss": 2.7252, "step": 1182000 }, { "epoch": 2.4753407911042635, "grad_norm": 17.586681365966797, "learning_rate": 3.770222901877863e-05, "loss": 2.722, "step": 1182500 }, { "epoch": 2.476387446829889, "grad_norm": 15.198477745056152, "learning_rate": 3.769698476228834e-05, "loss": 2.707, "step": 1183000 }, { "epoch": 2.4774341025555144, "grad_norm": 13.204773902893066, "learning_rate": 3.7691740505798053e-05, "loss": 2.7021, "step": 1183500 }, { "epoch": 2.4784807582811403, "grad_norm": 12.9056978225708, "learning_rate": 3.7686496249307765e-05, "loss": 2.708, "step": 1184000 }, { "epoch": 2.479527414006766, "grad_norm": 14.15368938446045, "learning_rate": 3.768125199281747e-05, "loss": 2.709, "step": 1184500 }, { "epoch": 2.4805740697323913, "grad_norm": 14.285749435424805, "learning_rate": 3.767600773632717e-05, "loss": 2.7267, "step": 1185000 }, { "epoch": 2.4816207254580167, "grad_norm": 13.09260368347168, "learning_rate": 3.7670763479836884e-05, "loss": 2.7109, "step": 1185500 }, { "epoch": 2.482667381183642, "grad_norm": 13.987945556640625, "learning_rate": 3.766551922334659e-05, "loss": 2.7151, "step": 1186000 }, { "epoch": 2.4837140369092676, "grad_norm": 13.127507209777832, "learning_rate": 3.76602749668563e-05, "loss": 2.7303, "step": 1186500 }, { "epoch": 2.484760692634893, "grad_norm": 16.22881507873535, "learning_rate": 3.7655030710366004e-05, "loss": 2.6948, "step": 1187000 }, { "epoch": 2.4858073483605185, "grad_norm": 22.73581314086914, "learning_rate": 3.7649786453875715e-05, "loss": 2.6964, "step": 1187500 }, { "epoch": 2.486854004086144, "grad_norm": 11.451228141784668, "learning_rate": 3.7644542197385426e-05, "loss": 2.6914, "step": 1188000 }, { "epoch": 2.4879006598117694, "grad_norm": 12.808219909667969, "learning_rate": 3.763929794089514e-05, "loss": 2.7191, "step": 1188500 }, { "epoch": 2.488947315537395, "grad_norm": 13.449833869934082, "learning_rate": 3.763405368440484e-05, "loss": 2.7109, "step": 1189000 }, { "epoch": 2.4899939712630204, "grad_norm": 13.800847053527832, "learning_rate": 3.762880942791455e-05, "loss": 2.7072, "step": 1189500 }, { "epoch": 2.491040626988646, "grad_norm": 14.282458305358887, "learning_rate": 3.762356517142426e-05, "loss": 2.7071, "step": 1190000 }, { "epoch": 2.4920872827142713, "grad_norm": 13.920588493347168, "learning_rate": 3.761832091493397e-05, "loss": 2.7155, "step": 1190500 }, { "epoch": 2.4931339384398967, "grad_norm": 11.081530570983887, "learning_rate": 3.761307665844367e-05, "loss": 2.7264, "step": 1191000 }, { "epoch": 2.494180594165522, "grad_norm": 12.048199653625488, "learning_rate": 3.760783240195338e-05, "loss": 2.6918, "step": 1191500 }, { "epoch": 2.4952272498911476, "grad_norm": 15.557369232177734, "learning_rate": 3.7602588145463094e-05, "loss": 2.7096, "step": 1192000 }, { "epoch": 2.496273905616773, "grad_norm": 12.657456398010254, "learning_rate": 3.75973438889728e-05, "loss": 2.7054, "step": 1192500 }, { "epoch": 2.4973205613423985, "grad_norm": 14.283748626708984, "learning_rate": 3.759209963248251e-05, "loss": 2.7018, "step": 1193000 }, { "epoch": 2.498367217068024, "grad_norm": 11.831854820251465, "learning_rate": 3.7586855375992214e-05, "loss": 2.6988, "step": 1193500 }, { "epoch": 2.49941387279365, "grad_norm": 10.94542407989502, "learning_rate": 3.7581611119501925e-05, "loss": 2.7013, "step": 1194000 }, { "epoch": 2.5004605285192754, "grad_norm": 16.074647903442383, "learning_rate": 3.757636686301163e-05, "loss": 2.717, "step": 1194500 }, { "epoch": 2.501507184244901, "grad_norm": 17.981428146362305, "learning_rate": 3.757112260652134e-05, "loss": 2.7128, "step": 1195000 }, { "epoch": 2.5025538399705263, "grad_norm": 12.985294342041016, "learning_rate": 3.7565878350031045e-05, "loss": 2.7069, "step": 1195500 }, { "epoch": 2.5036004956961517, "grad_norm": 14.948684692382812, "learning_rate": 3.7560634093540756e-05, "loss": 2.7112, "step": 1196000 }, { "epoch": 2.504647151421777, "grad_norm": 12.445347785949707, "learning_rate": 3.755538983705047e-05, "loss": 2.7134, "step": 1196500 }, { "epoch": 2.5056938071474026, "grad_norm": 14.257808685302734, "learning_rate": 3.755014558056017e-05, "loss": 2.7151, "step": 1197000 }, { "epoch": 2.506740462873028, "grad_norm": 13.385129928588867, "learning_rate": 3.754490132406988e-05, "loss": 2.7191, "step": 1197500 }, { "epoch": 2.5077871185986536, "grad_norm": 12.289680480957031, "learning_rate": 3.753965706757959e-05, "loss": 2.6876, "step": 1198000 }, { "epoch": 2.508833774324279, "grad_norm": 13.23396110534668, "learning_rate": 3.75344128110893e-05, "loss": 2.7014, "step": 1198500 }, { "epoch": 2.5098804300499045, "grad_norm": 14.957157135009766, "learning_rate": 3.7529168554599e-05, "loss": 2.6874, "step": 1199000 }, { "epoch": 2.51092708577553, "grad_norm": 18.38215446472168, "learning_rate": 3.752392429810871e-05, "loss": 2.7116, "step": 1199500 }, { "epoch": 2.5119737415011554, "grad_norm": 13.649070739746094, "learning_rate": 3.751868004161842e-05, "loss": 2.7069, "step": 1200000 }, { "epoch": 2.513020397226781, "grad_norm": 18.31827735900879, "learning_rate": 3.751343578512813e-05, "loss": 2.7055, "step": 1200500 }, { "epoch": 2.5140670529524067, "grad_norm": 14.557238578796387, "learning_rate": 3.750819152863784e-05, "loss": 2.7132, "step": 1201000 }, { "epoch": 2.515113708678032, "grad_norm": 13.049874305725098, "learning_rate": 3.750294727214755e-05, "loss": 2.6974, "step": 1201500 }, { "epoch": 2.5161603644036576, "grad_norm": 16.485137939453125, "learning_rate": 3.7497703015657255e-05, "loss": 2.71, "step": 1202000 }, { "epoch": 2.517207020129283, "grad_norm": 13.839595794677734, "learning_rate": 3.749245875916696e-05, "loss": 2.7191, "step": 1202500 }, { "epoch": 2.5182536758549086, "grad_norm": 16.42652702331543, "learning_rate": 3.748721450267667e-05, "loss": 2.7233, "step": 1203000 }, { "epoch": 2.519300331580534, "grad_norm": 14.5693941116333, "learning_rate": 3.7481970246186375e-05, "loss": 2.7087, "step": 1203500 }, { "epoch": 2.5203469873061595, "grad_norm": 14.0714111328125, "learning_rate": 3.7476725989696086e-05, "loss": 2.7083, "step": 1204000 }, { "epoch": 2.521393643031785, "grad_norm": 12.263132095336914, "learning_rate": 3.747148173320579e-05, "loss": 2.7171, "step": 1204500 }, { "epoch": 2.5224402987574104, "grad_norm": 11.358232498168945, "learning_rate": 3.74662374767155e-05, "loss": 2.7189, "step": 1205000 }, { "epoch": 2.523486954483036, "grad_norm": 13.139362335205078, "learning_rate": 3.746099322022521e-05, "loss": 2.6993, "step": 1205500 }, { "epoch": 2.5245336102086613, "grad_norm": 13.882486343383789, "learning_rate": 3.745574896373492e-05, "loss": 2.692, "step": 1206000 }, { "epoch": 2.5255802659342867, "grad_norm": 14.032849311828613, "learning_rate": 3.745050470724463e-05, "loss": 2.6968, "step": 1206500 }, { "epoch": 2.526626921659912, "grad_norm": 14.281942367553711, "learning_rate": 3.744526045075434e-05, "loss": 2.6997, "step": 1207000 }, { "epoch": 2.5276735773855377, "grad_norm": 11.452289581298828, "learning_rate": 3.744001619426404e-05, "loss": 2.7102, "step": 1207500 }, { "epoch": 2.528720233111163, "grad_norm": 13.830314636230469, "learning_rate": 3.743477193777375e-05, "loss": 2.6862, "step": 1208000 }, { "epoch": 2.5297668888367886, "grad_norm": 21.011430740356445, "learning_rate": 3.742952768128346e-05, "loss": 2.6864, "step": 1208500 }, { "epoch": 2.530813544562414, "grad_norm": 14.912256240844727, "learning_rate": 3.742428342479317e-05, "loss": 2.7176, "step": 1209000 }, { "epoch": 2.5318602002880395, "grad_norm": 12.822266578674316, "learning_rate": 3.741903916830288e-05, "loss": 2.7071, "step": 1209500 }, { "epoch": 2.532906856013665, "grad_norm": 14.53126049041748, "learning_rate": 3.7413794911812585e-05, "loss": 2.7044, "step": 1210000 }, { "epoch": 2.5339535117392904, "grad_norm": 13.967884063720703, "learning_rate": 3.7408550655322296e-05, "loss": 2.7018, "step": 1210500 }, { "epoch": 2.535000167464916, "grad_norm": 15.07646369934082, "learning_rate": 3.7403306398832e-05, "loss": 2.6904, "step": 1211000 }, { "epoch": 2.5360468231905413, "grad_norm": 13.263459205627441, "learning_rate": 3.739806214234171e-05, "loss": 2.6983, "step": 1211500 }, { "epoch": 2.537093478916167, "grad_norm": 13.580118179321289, "learning_rate": 3.7392817885851416e-05, "loss": 2.6912, "step": 1212000 }, { "epoch": 2.5381401346417927, "grad_norm": 11.532090187072754, "learning_rate": 3.738757362936113e-05, "loss": 2.6945, "step": 1212500 }, { "epoch": 2.539186790367418, "grad_norm": 12.899067878723145, "learning_rate": 3.738232937287083e-05, "loss": 2.7168, "step": 1213000 }, { "epoch": 2.5402334460930436, "grad_norm": 14.952959060668945, "learning_rate": 3.737708511638054e-05, "loss": 2.7143, "step": 1213500 }, { "epoch": 2.541280101818669, "grad_norm": 13.08449649810791, "learning_rate": 3.737184085989025e-05, "loss": 2.7036, "step": 1214000 }, { "epoch": 2.5423267575442945, "grad_norm": 12.56120491027832, "learning_rate": 3.736659660339996e-05, "loss": 2.6932, "step": 1214500 }, { "epoch": 2.54337341326992, "grad_norm": 12.67249870300293, "learning_rate": 3.736135234690967e-05, "loss": 2.6948, "step": 1215000 }, { "epoch": 2.5444200689955454, "grad_norm": 11.352704048156738, "learning_rate": 3.735610809041937e-05, "loss": 2.7136, "step": 1215500 }, { "epoch": 2.545466724721171, "grad_norm": 12.415910720825195, "learning_rate": 3.7350863833929084e-05, "loss": 2.7107, "step": 1216000 }, { "epoch": 2.5465133804467963, "grad_norm": 13.527274131774902, "learning_rate": 3.734561957743879e-05, "loss": 2.6888, "step": 1216500 }, { "epoch": 2.5475600361724218, "grad_norm": 12.718008995056152, "learning_rate": 3.73403753209485e-05, "loss": 2.7161, "step": 1217000 }, { "epoch": 2.5486066918980472, "grad_norm": 11.146297454833984, "learning_rate": 3.7335131064458204e-05, "loss": 2.6984, "step": 1217500 }, { "epoch": 2.5496533476236727, "grad_norm": 11.864267349243164, "learning_rate": 3.7329886807967915e-05, "loss": 2.711, "step": 1218000 }, { "epoch": 2.5507000033492986, "grad_norm": 12.889538764953613, "learning_rate": 3.7324642551477626e-05, "loss": 2.6766, "step": 1218500 }, { "epoch": 2.551746659074924, "grad_norm": 12.007314682006836, "learning_rate": 3.731939829498734e-05, "loss": 2.6812, "step": 1219000 }, { "epoch": 2.5527933148005495, "grad_norm": 11.860067367553711, "learning_rate": 3.731415403849704e-05, "loss": 2.6999, "step": 1219500 }, { "epoch": 2.553839970526175, "grad_norm": 12.418346405029297, "learning_rate": 3.7308909782006745e-05, "loss": 2.6775, "step": 1220000 }, { "epoch": 2.5548866262518004, "grad_norm": 11.742897033691406, "learning_rate": 3.7303665525516456e-05, "loss": 2.6883, "step": 1220500 }, { "epoch": 2.555933281977426, "grad_norm": 12.653077125549316, "learning_rate": 3.729842126902616e-05, "loss": 2.7054, "step": 1221000 }, { "epoch": 2.5569799377030513, "grad_norm": 13.233705520629883, "learning_rate": 3.729317701253587e-05, "loss": 2.7011, "step": 1221500 }, { "epoch": 2.5580265934286768, "grad_norm": 11.697505950927734, "learning_rate": 3.7287932756045576e-05, "loss": 2.6941, "step": 1222000 }, { "epoch": 2.5590732491543022, "grad_norm": 13.825982093811035, "learning_rate": 3.728268849955529e-05, "loss": 2.7, "step": 1222500 }, { "epoch": 2.5601199048799277, "grad_norm": 14.042125701904297, "learning_rate": 3.7277444243065e-05, "loss": 2.7162, "step": 1223000 }, { "epoch": 2.561166560605553, "grad_norm": 13.555169105529785, "learning_rate": 3.727219998657471e-05, "loss": 2.6765, "step": 1223500 }, { "epoch": 2.5622132163311786, "grad_norm": 11.908496856689453, "learning_rate": 3.7266955730084414e-05, "loss": 2.7048, "step": 1224000 }, { "epoch": 2.563259872056804, "grad_norm": 12.707061767578125, "learning_rate": 3.7261711473594125e-05, "loss": 2.7044, "step": 1224500 }, { "epoch": 2.5643065277824295, "grad_norm": 12.73929500579834, "learning_rate": 3.725646721710383e-05, "loss": 2.7215, "step": 1225000 }, { "epoch": 2.565353183508055, "grad_norm": 14.50043773651123, "learning_rate": 3.7251222960613533e-05, "loss": 2.6948, "step": 1225500 }, { "epoch": 2.5663998392336804, "grad_norm": 14.422345161437988, "learning_rate": 3.7245978704123244e-05, "loss": 2.694, "step": 1226000 }, { "epoch": 2.567446494959306, "grad_norm": 12.251717567443848, "learning_rate": 3.7240734447632956e-05, "loss": 2.6795, "step": 1226500 }, { "epoch": 2.5684931506849313, "grad_norm": 12.50996208190918, "learning_rate": 3.723549019114267e-05, "loss": 2.6865, "step": 1227000 }, { "epoch": 2.569539806410557, "grad_norm": 13.501933097839355, "learning_rate": 3.723024593465237e-05, "loss": 2.7002, "step": 1227500 }, { "epoch": 2.5705864621361822, "grad_norm": 12.081031799316406, "learning_rate": 3.722500167816208e-05, "loss": 2.6983, "step": 1228000 }, { "epoch": 2.5716331178618077, "grad_norm": 11.752591133117676, "learning_rate": 3.7219757421671786e-05, "loss": 2.7089, "step": 1228500 }, { "epoch": 2.572679773587433, "grad_norm": 13.668989181518555, "learning_rate": 3.72145131651815e-05, "loss": 2.6888, "step": 1229000 }, { "epoch": 2.573726429313059, "grad_norm": 12.256275177001953, "learning_rate": 3.72092689086912e-05, "loss": 2.6875, "step": 1229500 }, { "epoch": 2.5747730850386845, "grad_norm": 11.541889190673828, "learning_rate": 3.720402465220091e-05, "loss": 2.6942, "step": 1230000 }, { "epoch": 2.57581974076431, "grad_norm": 11.26649284362793, "learning_rate": 3.719878039571062e-05, "loss": 2.6906, "step": 1230500 }, { "epoch": 2.5768663964899354, "grad_norm": 12.041967391967773, "learning_rate": 3.719353613922033e-05, "loss": 2.6823, "step": 1231000 }, { "epoch": 2.577913052215561, "grad_norm": 15.39585018157959, "learning_rate": 3.718829188273004e-05, "loss": 2.714, "step": 1231500 }, { "epoch": 2.5789597079411863, "grad_norm": 14.333049774169922, "learning_rate": 3.7183047626239744e-05, "loss": 2.7012, "step": 1232000 }, { "epoch": 2.580006363666812, "grad_norm": 13.88097095489502, "learning_rate": 3.7177803369749455e-05, "loss": 2.6862, "step": 1232500 }, { "epoch": 2.5810530193924373, "grad_norm": 13.299707412719727, "learning_rate": 3.717255911325916e-05, "loss": 2.681, "step": 1233000 }, { "epoch": 2.5820996751180627, "grad_norm": 11.71819019317627, "learning_rate": 3.716731485676887e-05, "loss": 2.6818, "step": 1233500 }, { "epoch": 2.583146330843688, "grad_norm": 16.088701248168945, "learning_rate": 3.7162070600278574e-05, "loss": 2.7123, "step": 1234000 }, { "epoch": 2.5841929865693136, "grad_norm": 11.91788101196289, "learning_rate": 3.7156826343788285e-05, "loss": 2.722, "step": 1234500 }, { "epoch": 2.585239642294939, "grad_norm": 13.590066909790039, "learning_rate": 3.715158208729799e-05, "loss": 2.6917, "step": 1235000 }, { "epoch": 2.5862862980205645, "grad_norm": 13.48992919921875, "learning_rate": 3.71463378308077e-05, "loss": 2.6906, "step": 1235500 }, { "epoch": 2.5873329537461904, "grad_norm": 11.175348281860352, "learning_rate": 3.714109357431741e-05, "loss": 2.676, "step": 1236000 }, { "epoch": 2.588379609471816, "grad_norm": 13.555557250976562, "learning_rate": 3.7135849317827116e-05, "loss": 2.6934, "step": 1236500 }, { "epoch": 2.5894262651974413, "grad_norm": 13.192166328430176, "learning_rate": 3.713060506133683e-05, "loss": 2.6907, "step": 1237000 }, { "epoch": 2.590472920923067, "grad_norm": 11.876919746398926, "learning_rate": 3.712536080484653e-05, "loss": 2.6978, "step": 1237500 }, { "epoch": 2.5915195766486923, "grad_norm": 12.698010444641113, "learning_rate": 3.712011654835624e-05, "loss": 2.6944, "step": 1238000 }, { "epoch": 2.5925662323743177, "grad_norm": 15.007696151733398, "learning_rate": 3.711487229186595e-05, "loss": 2.6952, "step": 1238500 }, { "epoch": 2.593612888099943, "grad_norm": 14.234160423278809, "learning_rate": 3.710962803537566e-05, "loss": 2.6916, "step": 1239000 }, { "epoch": 2.5946595438255686, "grad_norm": 13.14816951751709, "learning_rate": 3.710438377888536e-05, "loss": 2.685, "step": 1239500 }, { "epoch": 2.595706199551194, "grad_norm": 13.690735816955566, "learning_rate": 3.709913952239507e-05, "loss": 2.6843, "step": 1240000 }, { "epoch": 2.5967528552768195, "grad_norm": 15.038338661193848, "learning_rate": 3.7093895265904784e-05, "loss": 2.6974, "step": 1240500 }, { "epoch": 2.597799511002445, "grad_norm": 12.94905948638916, "learning_rate": 3.7088651009414496e-05, "loss": 2.7116, "step": 1241000 }, { "epoch": 2.5988461667280705, "grad_norm": 13.061127662658691, "learning_rate": 3.70834067529242e-05, "loss": 2.6923, "step": 1241500 }, { "epoch": 2.599892822453696, "grad_norm": 17.982189178466797, "learning_rate": 3.7078162496433904e-05, "loss": 2.6877, "step": 1242000 }, { "epoch": 2.6009394781793214, "grad_norm": 12.981221199035645, "learning_rate": 3.7072918239943615e-05, "loss": 2.7031, "step": 1242500 }, { "epoch": 2.601986133904947, "grad_norm": 13.295928001403809, "learning_rate": 3.706767398345332e-05, "loss": 2.7109, "step": 1243000 }, { "epoch": 2.6030327896305723, "grad_norm": 12.83735179901123, "learning_rate": 3.706242972696303e-05, "loss": 2.6837, "step": 1243500 }, { "epoch": 2.6040794453561977, "grad_norm": 14.118315696716309, "learning_rate": 3.705718547047274e-05, "loss": 2.6827, "step": 1244000 }, { "epoch": 2.605126101081823, "grad_norm": 13.38209342956543, "learning_rate": 3.705194121398245e-05, "loss": 2.6919, "step": 1244500 }, { "epoch": 2.6061727568074486, "grad_norm": 12.098398208618164, "learning_rate": 3.704669695749216e-05, "loss": 2.6942, "step": 1245000 }, { "epoch": 2.607219412533074, "grad_norm": 11.837861061096191, "learning_rate": 3.704145270100187e-05, "loss": 2.6743, "step": 1245500 }, { "epoch": 2.6082660682586996, "grad_norm": 13.159554481506348, "learning_rate": 3.703620844451157e-05, "loss": 2.6949, "step": 1246000 }, { "epoch": 2.609312723984325, "grad_norm": 13.246421813964844, "learning_rate": 3.7030964188021284e-05, "loss": 2.7037, "step": 1246500 }, { "epoch": 2.610359379709951, "grad_norm": 9.671916007995605, "learning_rate": 3.702571993153099e-05, "loss": 2.6957, "step": 1247000 }, { "epoch": 2.6114060354355764, "grad_norm": 12.288865089416504, "learning_rate": 3.702047567504069e-05, "loss": 2.6825, "step": 1247500 }, { "epoch": 2.612452691161202, "grad_norm": 17.522741317749023, "learning_rate": 3.70152314185504e-05, "loss": 2.6719, "step": 1248000 }, { "epoch": 2.6134993468868273, "grad_norm": 12.139548301696777, "learning_rate": 3.7009987162060114e-05, "loss": 2.6707, "step": 1248500 }, { "epoch": 2.6145460026124527, "grad_norm": 12.009151458740234, "learning_rate": 3.7004742905569825e-05, "loss": 2.6943, "step": 1249000 }, { "epoch": 2.615592658338078, "grad_norm": 12.688828468322754, "learning_rate": 3.699949864907953e-05, "loss": 2.6818, "step": 1249500 }, { "epoch": 2.6166393140637036, "grad_norm": 12.740991592407227, "learning_rate": 3.699425439258924e-05, "loss": 2.677, "step": 1250000 }, { "epoch": 2.617685969789329, "grad_norm": 14.621610641479492, "learning_rate": 3.6989010136098945e-05, "loss": 2.7056, "step": 1250500 }, { "epoch": 2.6187326255149546, "grad_norm": 15.376764297485352, "learning_rate": 3.6983765879608656e-05, "loss": 2.7013, "step": 1251000 }, { "epoch": 2.61977928124058, "grad_norm": 12.747124671936035, "learning_rate": 3.697852162311836e-05, "loss": 2.6907, "step": 1251500 }, { "epoch": 2.6208259369662055, "grad_norm": 12.984103202819824, "learning_rate": 3.697327736662807e-05, "loss": 2.6823, "step": 1252000 }, { "epoch": 2.621872592691831, "grad_norm": 13.617154121398926, "learning_rate": 3.6968033110137776e-05, "loss": 2.677, "step": 1252500 }, { "epoch": 2.6229192484174564, "grad_norm": 12.589401245117188, "learning_rate": 3.696278885364749e-05, "loss": 2.689, "step": 1253000 }, { "epoch": 2.6239659041430823, "grad_norm": 15.341510772705078, "learning_rate": 3.69575445971572e-05, "loss": 2.6904, "step": 1253500 }, { "epoch": 2.6250125598687077, "grad_norm": 11.761841773986816, "learning_rate": 3.69523003406669e-05, "loss": 2.6935, "step": 1254000 }, { "epoch": 2.626059215594333, "grad_norm": 14.096707344055176, "learning_rate": 3.694705608417661e-05, "loss": 2.6725, "step": 1254500 }, { "epoch": 2.6271058713199587, "grad_norm": 13.744123458862305, "learning_rate": 3.694181182768632e-05, "loss": 2.7022, "step": 1255000 }, { "epoch": 2.628152527045584, "grad_norm": 15.400134086608887, "learning_rate": 3.693656757119603e-05, "loss": 2.6842, "step": 1255500 }, { "epoch": 2.6291991827712096, "grad_norm": 13.036240577697754, "learning_rate": 3.693132331470573e-05, "loss": 2.7005, "step": 1256000 }, { "epoch": 2.630245838496835, "grad_norm": 12.832867622375488, "learning_rate": 3.6926079058215444e-05, "loss": 2.6969, "step": 1256500 }, { "epoch": 2.6312924942224605, "grad_norm": 12.4502592086792, "learning_rate": 3.692083480172515e-05, "loss": 2.6668, "step": 1257000 }, { "epoch": 2.632339149948086, "grad_norm": 13.332738876342773, "learning_rate": 3.691559054523486e-05, "loss": 2.7119, "step": 1257500 }, { "epoch": 2.6333858056737114, "grad_norm": 12.514691352844238, "learning_rate": 3.691034628874457e-05, "loss": 2.6842, "step": 1258000 }, { "epoch": 2.634432461399337, "grad_norm": 13.985730171203613, "learning_rate": 3.690510203225428e-05, "loss": 2.6868, "step": 1258500 }, { "epoch": 2.6354791171249623, "grad_norm": 13.317743301391602, "learning_rate": 3.6899857775763986e-05, "loss": 2.6871, "step": 1259000 }, { "epoch": 2.6365257728505878, "grad_norm": 13.121598243713379, "learning_rate": 3.689461351927369e-05, "loss": 2.693, "step": 1259500 }, { "epoch": 2.637572428576213, "grad_norm": 12.321128845214844, "learning_rate": 3.68893692627834e-05, "loss": 2.6952, "step": 1260000 }, { "epoch": 2.6386190843018387, "grad_norm": 13.525671005249023, "learning_rate": 3.6884125006293106e-05, "loss": 2.6942, "step": 1260500 }, { "epoch": 2.639665740027464, "grad_norm": 12.318087577819824, "learning_rate": 3.687888074980282e-05, "loss": 2.672, "step": 1261000 }, { "epoch": 2.6407123957530896, "grad_norm": 12.694138526916504, "learning_rate": 3.687363649331253e-05, "loss": 2.7042, "step": 1261500 }, { "epoch": 2.641759051478715, "grad_norm": 13.3847017288208, "learning_rate": 3.686839223682224e-05, "loss": 2.6967, "step": 1262000 }, { "epoch": 2.6428057072043405, "grad_norm": 11.563575744628906, "learning_rate": 3.686314798033194e-05, "loss": 2.684, "step": 1262500 }, { "epoch": 2.643852362929966, "grad_norm": 12.413064002990723, "learning_rate": 3.6857903723841654e-05, "loss": 2.6799, "step": 1263000 }, { "epoch": 2.6448990186555914, "grad_norm": 18.125635147094727, "learning_rate": 3.685265946735136e-05, "loss": 2.6972, "step": 1263500 }, { "epoch": 2.645945674381217, "grad_norm": 13.822951316833496, "learning_rate": 3.684741521086107e-05, "loss": 2.6756, "step": 1264000 }, { "epoch": 2.6469923301068428, "grad_norm": 13.00633716583252, "learning_rate": 3.6842170954370774e-05, "loss": 2.7017, "step": 1264500 }, { "epoch": 2.648038985832468, "grad_norm": 12.583943367004395, "learning_rate": 3.683692669788048e-05, "loss": 2.6776, "step": 1265000 }, { "epoch": 2.6490856415580937, "grad_norm": 12.445367813110352, "learning_rate": 3.683168244139019e-05, "loss": 2.6725, "step": 1265500 }, { "epoch": 2.650132297283719, "grad_norm": 12.620363235473633, "learning_rate": 3.68264381848999e-05, "loss": 2.6811, "step": 1266000 }, { "epoch": 2.6511789530093446, "grad_norm": 14.11981201171875, "learning_rate": 3.682119392840961e-05, "loss": 2.6758, "step": 1266500 }, { "epoch": 2.65222560873497, "grad_norm": 12.670991897583008, "learning_rate": 3.6815949671919316e-05, "loss": 2.6778, "step": 1267000 }, { "epoch": 2.6532722644605955, "grad_norm": 12.272502899169922, "learning_rate": 3.681070541542903e-05, "loss": 2.6757, "step": 1267500 }, { "epoch": 2.654318920186221, "grad_norm": 13.370963096618652, "learning_rate": 3.680546115893873e-05, "loss": 2.657, "step": 1268000 }, { "epoch": 2.6553655759118464, "grad_norm": 12.769838333129883, "learning_rate": 3.680021690244844e-05, "loss": 2.6806, "step": 1268500 }, { "epoch": 2.656412231637472, "grad_norm": 12.842864990234375, "learning_rate": 3.6794972645958147e-05, "loss": 2.6951, "step": 1269000 }, { "epoch": 2.6574588873630973, "grad_norm": 11.258556365966797, "learning_rate": 3.678972838946786e-05, "loss": 2.6747, "step": 1269500 }, { "epoch": 2.6585055430887228, "grad_norm": 13.05175495147705, "learning_rate": 3.678448413297756e-05, "loss": 2.6895, "step": 1270000 }, { "epoch": 2.6595521988143482, "grad_norm": 14.05122184753418, "learning_rate": 3.677923987648727e-05, "loss": 2.6839, "step": 1270500 }, { "epoch": 2.660598854539974, "grad_norm": 14.445016860961914, "learning_rate": 3.6773995619996984e-05, "loss": 2.688, "step": 1271000 }, { "epoch": 2.6616455102655996, "grad_norm": 12.407801628112793, "learning_rate": 3.676875136350669e-05, "loss": 2.6678, "step": 1271500 }, { "epoch": 2.662692165991225, "grad_norm": 13.771946907043457, "learning_rate": 3.67635071070164e-05, "loss": 2.6932, "step": 1272000 }, { "epoch": 2.6637388217168505, "grad_norm": 12.048415184020996, "learning_rate": 3.6758262850526104e-05, "loss": 2.6814, "step": 1272500 }, { "epoch": 2.664785477442476, "grad_norm": 12.329042434692383, "learning_rate": 3.6753018594035815e-05, "loss": 2.6752, "step": 1273000 }, { "epoch": 2.6658321331681014, "grad_norm": 12.483675003051758, "learning_rate": 3.674777433754552e-05, "loss": 2.7053, "step": 1273500 }, { "epoch": 2.666878788893727, "grad_norm": 12.299568176269531, "learning_rate": 3.674253008105523e-05, "loss": 2.6806, "step": 1274000 }, { "epoch": 2.6679254446193523, "grad_norm": 13.24109935760498, "learning_rate": 3.6737285824564935e-05, "loss": 2.6777, "step": 1274500 }, { "epoch": 2.668972100344978, "grad_norm": 13.947431564331055, "learning_rate": 3.6732041568074646e-05, "loss": 2.6969, "step": 1275000 }, { "epoch": 2.6700187560706032, "grad_norm": 12.464706420898438, "learning_rate": 3.672679731158436e-05, "loss": 2.6798, "step": 1275500 }, { "epoch": 2.6710654117962287, "grad_norm": 13.84419059753418, "learning_rate": 3.672155305509406e-05, "loss": 2.6809, "step": 1276000 }, { "epoch": 2.672112067521854, "grad_norm": 12.739603996276855, "learning_rate": 3.671630879860377e-05, "loss": 2.6749, "step": 1276500 }, { "epoch": 2.6731587232474796, "grad_norm": 13.185286521911621, "learning_rate": 3.6711064542113476e-05, "loss": 2.7035, "step": 1277000 }, { "epoch": 2.674205378973105, "grad_norm": 11.10234260559082, "learning_rate": 3.670582028562319e-05, "loss": 2.6904, "step": 1277500 }, { "epoch": 2.6752520346987305, "grad_norm": 12.292919158935547, "learning_rate": 3.670057602913289e-05, "loss": 2.6888, "step": 1278000 }, { "epoch": 2.676298690424356, "grad_norm": 14.657435417175293, "learning_rate": 3.66953317726426e-05, "loss": 2.6745, "step": 1278500 }, { "epoch": 2.6773453461499814, "grad_norm": 17.58580207824707, "learning_rate": 3.669008751615231e-05, "loss": 2.686, "step": 1279000 }, { "epoch": 2.678392001875607, "grad_norm": 13.371146202087402, "learning_rate": 3.6684843259662025e-05, "loss": 2.6882, "step": 1279500 }, { "epoch": 2.6794386576012323, "grad_norm": 13.485466003417969, "learning_rate": 3.667959900317173e-05, "loss": 2.6687, "step": 1280000 }, { "epoch": 2.680485313326858, "grad_norm": 13.83978271484375, "learning_rate": 3.667435474668144e-05, "loss": 2.6847, "step": 1280500 }, { "epoch": 2.6815319690524833, "grad_norm": 12.571144104003906, "learning_rate": 3.6669110490191145e-05, "loss": 2.6908, "step": 1281000 }, { "epoch": 2.6825786247781087, "grad_norm": 13.128884315490723, "learning_rate": 3.666386623370085e-05, "loss": 2.6863, "step": 1281500 }, { "epoch": 2.6836252805037346, "grad_norm": 13.521251678466797, "learning_rate": 3.665862197721056e-05, "loss": 2.6771, "step": 1282000 }, { "epoch": 2.68467193622936, "grad_norm": 13.105287551879883, "learning_rate": 3.6653377720720264e-05, "loss": 2.685, "step": 1282500 }, { "epoch": 2.6857185919549855, "grad_norm": 14.188323020935059, "learning_rate": 3.6648133464229975e-05, "loss": 2.6809, "step": 1283000 }, { "epoch": 2.686765247680611, "grad_norm": 13.385797500610352, "learning_rate": 3.6642889207739687e-05, "loss": 2.6901, "step": 1283500 }, { "epoch": 2.6878119034062364, "grad_norm": 12.91075611114502, "learning_rate": 3.66376449512494e-05, "loss": 2.6864, "step": 1284000 }, { "epoch": 2.688858559131862, "grad_norm": 26.295059204101562, "learning_rate": 3.66324006947591e-05, "loss": 2.7002, "step": 1284500 }, { "epoch": 2.6899052148574873, "grad_norm": 15.03218936920166, "learning_rate": 3.662715643826881e-05, "loss": 2.674, "step": 1285000 }, { "epoch": 2.690951870583113, "grad_norm": 15.99989128112793, "learning_rate": 3.662191218177852e-05, "loss": 2.6905, "step": 1285500 }, { "epoch": 2.6919985263087383, "grad_norm": 13.757613182067871, "learning_rate": 3.661666792528823e-05, "loss": 2.667, "step": 1286000 }, { "epoch": 2.6930451820343637, "grad_norm": 12.149504661560059, "learning_rate": 3.661142366879793e-05, "loss": 2.6826, "step": 1286500 }, { "epoch": 2.694091837759989, "grad_norm": 12.499720573425293, "learning_rate": 3.6606179412307644e-05, "loss": 2.6753, "step": 1287000 }, { "epoch": 2.6951384934856146, "grad_norm": 13.644685745239258, "learning_rate": 3.660093515581735e-05, "loss": 2.6804, "step": 1287500 }, { "epoch": 2.69618514921124, "grad_norm": 14.495174407958984, "learning_rate": 3.659569089932706e-05, "loss": 2.6705, "step": 1288000 }, { "epoch": 2.697231804936866, "grad_norm": 13.629533767700195, "learning_rate": 3.659044664283677e-05, "loss": 2.676, "step": 1288500 }, { "epoch": 2.6982784606624914, "grad_norm": 16.432783126831055, "learning_rate": 3.6585202386346475e-05, "loss": 2.6729, "step": 1289000 }, { "epoch": 2.699325116388117, "grad_norm": 13.747520446777344, "learning_rate": 3.6579958129856186e-05, "loss": 2.6883, "step": 1289500 }, { "epoch": 2.7003717721137424, "grad_norm": 12.959281921386719, "learning_rate": 3.657471387336589e-05, "loss": 2.6978, "step": 1290000 }, { "epoch": 2.701418427839368, "grad_norm": 11.542014122009277, "learning_rate": 3.65694696168756e-05, "loss": 2.6741, "step": 1290500 }, { "epoch": 2.7024650835649933, "grad_norm": 13.56898307800293, "learning_rate": 3.6564225360385305e-05, "loss": 2.6758, "step": 1291000 }, { "epoch": 2.7035117392906187, "grad_norm": 14.322827339172363, "learning_rate": 3.6558981103895016e-05, "loss": 2.696, "step": 1291500 }, { "epoch": 2.704558395016244, "grad_norm": 13.440912246704102, "learning_rate": 3.655373684740472e-05, "loss": 2.6821, "step": 1292000 }, { "epoch": 2.7056050507418696, "grad_norm": 11.21749210357666, "learning_rate": 3.654849259091443e-05, "loss": 2.6683, "step": 1292500 }, { "epoch": 2.706651706467495, "grad_norm": 13.49620246887207, "learning_rate": 3.654324833442414e-05, "loss": 2.6727, "step": 1293000 }, { "epoch": 2.7076983621931205, "grad_norm": 12.342226028442383, "learning_rate": 3.653800407793385e-05, "loss": 2.6808, "step": 1293500 }, { "epoch": 2.708745017918746, "grad_norm": 11.596118927001953, "learning_rate": 3.653275982144356e-05, "loss": 2.6706, "step": 1294000 }, { "epoch": 2.7097916736443715, "grad_norm": 12.690829277038574, "learning_rate": 3.652751556495326e-05, "loss": 2.6824, "step": 1294500 }, { "epoch": 2.710838329369997, "grad_norm": 12.75137996673584, "learning_rate": 3.6522271308462974e-05, "loss": 2.6739, "step": 1295000 }, { "epoch": 2.7118849850956224, "grad_norm": 12.467578887939453, "learning_rate": 3.651702705197268e-05, "loss": 2.6911, "step": 1295500 }, { "epoch": 2.712931640821248, "grad_norm": 14.199813842773438, "learning_rate": 3.651178279548239e-05, "loss": 2.6839, "step": 1296000 }, { "epoch": 2.7139782965468733, "grad_norm": 13.202264785766602, "learning_rate": 3.650653853899209e-05, "loss": 2.6724, "step": 1296500 }, { "epoch": 2.7150249522724987, "grad_norm": 13.447807312011719, "learning_rate": 3.650129428250181e-05, "loss": 2.6771, "step": 1297000 }, { "epoch": 2.716071607998124, "grad_norm": 12.642659187316895, "learning_rate": 3.6496050026011515e-05, "loss": 2.6772, "step": 1297500 }, { "epoch": 2.7171182637237496, "grad_norm": 12.180033683776855, "learning_rate": 3.6490805769521227e-05, "loss": 2.6681, "step": 1298000 }, { "epoch": 2.718164919449375, "grad_norm": 12.93069839477539, "learning_rate": 3.648556151303093e-05, "loss": 2.6866, "step": 1298500 }, { "epoch": 2.7192115751750006, "grad_norm": 13.819849014282227, "learning_rate": 3.6480317256540635e-05, "loss": 2.6909, "step": 1299000 }, { "epoch": 2.7202582309006265, "grad_norm": 13.155223846435547, "learning_rate": 3.6475073000050346e-05, "loss": 2.6809, "step": 1299500 }, { "epoch": 2.721304886626252, "grad_norm": 13.090933799743652, "learning_rate": 3.646982874356005e-05, "loss": 2.6701, "step": 1300000 }, { "epoch": 2.7223515423518774, "grad_norm": 17.236671447753906, "learning_rate": 3.646458448706976e-05, "loss": 2.6817, "step": 1300500 }, { "epoch": 2.723398198077503, "grad_norm": 13.762901306152344, "learning_rate": 3.645934023057947e-05, "loss": 2.6714, "step": 1301000 }, { "epoch": 2.7244448538031283, "grad_norm": 12.702441215515137, "learning_rate": 3.6454095974089184e-05, "loss": 2.6772, "step": 1301500 }, { "epoch": 2.7254915095287537, "grad_norm": 12.366311073303223, "learning_rate": 3.644885171759889e-05, "loss": 2.6785, "step": 1302000 }, { "epoch": 2.726538165254379, "grad_norm": 12.948163032531738, "learning_rate": 3.64436074611086e-05, "loss": 2.6677, "step": 1302500 }, { "epoch": 2.7275848209800047, "grad_norm": 12.084718704223633, "learning_rate": 3.6438363204618303e-05, "loss": 2.6757, "step": 1303000 }, { "epoch": 2.72863147670563, "grad_norm": 11.529264450073242, "learning_rate": 3.6433118948128015e-05, "loss": 2.6674, "step": 1303500 }, { "epoch": 2.7296781324312556, "grad_norm": 12.76980209350586, "learning_rate": 3.642787469163772e-05, "loss": 2.681, "step": 1304000 }, { "epoch": 2.730724788156881, "grad_norm": 11.67212963104248, "learning_rate": 3.642263043514742e-05, "loss": 2.6627, "step": 1304500 }, { "epoch": 2.7317714438825065, "grad_norm": 15.393999099731445, "learning_rate": 3.6417386178657134e-05, "loss": 2.667, "step": 1305000 }, { "epoch": 2.732818099608132, "grad_norm": 12.122395515441895, "learning_rate": 3.6412141922166845e-05, "loss": 2.6837, "step": 1305500 }, { "epoch": 2.733864755333758, "grad_norm": 12.957448959350586, "learning_rate": 3.6406897665676556e-05, "loss": 2.6512, "step": 1306000 }, { "epoch": 2.7349114110593833, "grad_norm": 13.061802864074707, "learning_rate": 3.640165340918626e-05, "loss": 2.6768, "step": 1306500 }, { "epoch": 2.7359580667850087, "grad_norm": 13.086813926696777, "learning_rate": 3.639640915269597e-05, "loss": 2.6785, "step": 1307000 }, { "epoch": 2.737004722510634, "grad_norm": 13.918526649475098, "learning_rate": 3.6391164896205676e-05, "loss": 2.6836, "step": 1307500 }, { "epoch": 2.7380513782362597, "grad_norm": 12.976778030395508, "learning_rate": 3.638592063971539e-05, "loss": 2.6646, "step": 1308000 }, { "epoch": 2.739098033961885, "grad_norm": 11.056340217590332, "learning_rate": 3.638067638322509e-05, "loss": 2.6854, "step": 1308500 }, { "epoch": 2.7401446896875106, "grad_norm": 13.033612251281738, "learning_rate": 3.63754321267348e-05, "loss": 2.6978, "step": 1309000 }, { "epoch": 2.741191345413136, "grad_norm": 13.419838905334473, "learning_rate": 3.637018787024451e-05, "loss": 2.6637, "step": 1309500 }, { "epoch": 2.7422380011387615, "grad_norm": 12.36245059967041, "learning_rate": 3.636494361375422e-05, "loss": 2.6931, "step": 1310000 }, { "epoch": 2.743284656864387, "grad_norm": 13.498393058776855, "learning_rate": 3.635969935726393e-05, "loss": 2.6705, "step": 1310500 }, { "epoch": 2.7443313125900124, "grad_norm": 14.436126708984375, "learning_rate": 3.635445510077363e-05, "loss": 2.6763, "step": 1311000 }, { "epoch": 2.745377968315638, "grad_norm": 13.838361740112305, "learning_rate": 3.6349210844283344e-05, "loss": 2.666, "step": 1311500 }, { "epoch": 2.7464246240412633, "grad_norm": 13.223987579345703, "learning_rate": 3.634396658779305e-05, "loss": 2.6744, "step": 1312000 }, { "epoch": 2.7474712797668888, "grad_norm": 11.203413963317871, "learning_rate": 3.633872233130276e-05, "loss": 2.6804, "step": 1312500 }, { "epoch": 2.748517935492514, "grad_norm": 16.418546676635742, "learning_rate": 3.6333478074812464e-05, "loss": 2.667, "step": 1313000 }, { "epoch": 2.7495645912181397, "grad_norm": 14.69150161743164, "learning_rate": 3.6328233818322175e-05, "loss": 2.6913, "step": 1313500 }, { "epoch": 2.750611246943765, "grad_norm": 13.780945777893066, "learning_rate": 3.632298956183188e-05, "loss": 2.6631, "step": 1314000 }, { "epoch": 2.7516579026693906, "grad_norm": 14.50162124633789, "learning_rate": 3.63177453053416e-05, "loss": 2.6551, "step": 1314500 }, { "epoch": 2.752704558395016, "grad_norm": 21.49770164489746, "learning_rate": 3.63125010488513e-05, "loss": 2.6536, "step": 1315000 }, { "epoch": 2.7537512141206415, "grad_norm": 12.619449615478516, "learning_rate": 3.630725679236101e-05, "loss": 2.6829, "step": 1315500 }, { "epoch": 2.754797869846267, "grad_norm": 11.301820755004883, "learning_rate": 3.630201253587072e-05, "loss": 2.669, "step": 1316000 }, { "epoch": 2.7558445255718924, "grad_norm": 15.097548484802246, "learning_rate": 3.629676827938042e-05, "loss": 2.6749, "step": 1316500 }, { "epoch": 2.7568911812975183, "grad_norm": 14.027655601501465, "learning_rate": 3.629152402289013e-05, "loss": 2.6665, "step": 1317000 }, { "epoch": 2.7579378370231438, "grad_norm": 12.639123916625977, "learning_rate": 3.628627976639984e-05, "loss": 2.6707, "step": 1317500 }, { "epoch": 2.7589844927487692, "grad_norm": 14.16265869140625, "learning_rate": 3.628103550990955e-05, "loss": 2.6621, "step": 1318000 }, { "epoch": 2.7600311484743947, "grad_norm": 13.915075302124023, "learning_rate": 3.627579125341926e-05, "loss": 2.6868, "step": 1318500 }, { "epoch": 2.76107780420002, "grad_norm": 14.411564826965332, "learning_rate": 3.627054699692897e-05, "loss": 2.6746, "step": 1319000 }, { "epoch": 2.7621244599256456, "grad_norm": 12.811457633972168, "learning_rate": 3.6265302740438674e-05, "loss": 2.6683, "step": 1319500 }, { "epoch": 2.763171115651271, "grad_norm": 14.759682655334473, "learning_rate": 3.6260058483948385e-05, "loss": 2.6721, "step": 1320000 }, { "epoch": 2.7642177713768965, "grad_norm": 15.616433143615723, "learning_rate": 3.625481422745809e-05, "loss": 2.6805, "step": 1320500 }, { "epoch": 2.765264427102522, "grad_norm": 13.42575454711914, "learning_rate": 3.62495699709678e-05, "loss": 2.6788, "step": 1321000 }, { "epoch": 2.7663110828281474, "grad_norm": 12.835250854492188, "learning_rate": 3.6244325714477505e-05, "loss": 2.6801, "step": 1321500 }, { "epoch": 2.767357738553773, "grad_norm": 12.50024127960205, "learning_rate": 3.623908145798721e-05, "loss": 2.6693, "step": 1322000 }, { "epoch": 2.7684043942793983, "grad_norm": 12.364184379577637, "learning_rate": 3.623383720149692e-05, "loss": 2.6942, "step": 1322500 }, { "epoch": 2.769451050005024, "grad_norm": 12.568343162536621, "learning_rate": 3.622859294500663e-05, "loss": 2.6743, "step": 1323000 }, { "epoch": 2.7704977057306497, "grad_norm": 14.693735122680664, "learning_rate": 3.622334868851634e-05, "loss": 2.6799, "step": 1323500 }, { "epoch": 2.771544361456275, "grad_norm": 11.450887680053711, "learning_rate": 3.621810443202605e-05, "loss": 2.6581, "step": 1324000 }, { "epoch": 2.7725910171819006, "grad_norm": 11.519224166870117, "learning_rate": 3.621286017553576e-05, "loss": 2.6588, "step": 1324500 }, { "epoch": 2.773637672907526, "grad_norm": 12.430878639221191, "learning_rate": 3.620761591904546e-05, "loss": 2.6807, "step": 1325000 }, { "epoch": 2.7746843286331515, "grad_norm": 14.386764526367188, "learning_rate": 3.620237166255517e-05, "loss": 2.6467, "step": 1325500 }, { "epoch": 2.775730984358777, "grad_norm": 13.156242370605469, "learning_rate": 3.619712740606488e-05, "loss": 2.6939, "step": 1326000 }, { "epoch": 2.7767776400844024, "grad_norm": 12.224178314208984, "learning_rate": 3.619188314957459e-05, "loss": 2.6682, "step": 1326500 }, { "epoch": 2.777824295810028, "grad_norm": 12.853728294372559, "learning_rate": 3.618663889308429e-05, "loss": 2.6785, "step": 1327000 }, { "epoch": 2.7788709515356533, "grad_norm": 12.590935707092285, "learning_rate": 3.6181394636594004e-05, "loss": 2.6711, "step": 1327500 }, { "epoch": 2.779917607261279, "grad_norm": 12.319830894470215, "learning_rate": 3.6176150380103715e-05, "loss": 2.6727, "step": 1328000 }, { "epoch": 2.7809642629869042, "grad_norm": 12.544235229492188, "learning_rate": 3.617090612361342e-05, "loss": 2.6642, "step": 1328500 }, { "epoch": 2.7820109187125297, "grad_norm": 13.0438871383667, "learning_rate": 3.616566186712313e-05, "loss": 2.685, "step": 1329000 }, { "epoch": 2.783057574438155, "grad_norm": 12.316332817077637, "learning_rate": 3.6160417610632835e-05, "loss": 2.6608, "step": 1329500 }, { "epoch": 2.7841042301637806, "grad_norm": 13.818482398986816, "learning_rate": 3.6155173354142546e-05, "loss": 2.6715, "step": 1330000 }, { "epoch": 2.785150885889406, "grad_norm": 13.0482177734375, "learning_rate": 3.614992909765225e-05, "loss": 2.6797, "step": 1330500 }, { "epoch": 2.7861975416150315, "grad_norm": 12.707328796386719, "learning_rate": 3.614468484116196e-05, "loss": 2.666, "step": 1331000 }, { "epoch": 2.787244197340657, "grad_norm": 12.306727409362793, "learning_rate": 3.6139440584671666e-05, "loss": 2.6727, "step": 1331500 }, { "epoch": 2.7882908530662824, "grad_norm": 13.56158447265625, "learning_rate": 3.613419632818138e-05, "loss": 2.6655, "step": 1332000 }, { "epoch": 2.789337508791908, "grad_norm": 13.200006484985352, "learning_rate": 3.612895207169109e-05, "loss": 2.6881, "step": 1332500 }, { "epoch": 2.7903841645175333, "grad_norm": 13.746410369873047, "learning_rate": 3.612370781520079e-05, "loss": 2.6685, "step": 1333000 }, { "epoch": 2.791430820243159, "grad_norm": 12.287256240844727, "learning_rate": 3.61184635587105e-05, "loss": 2.6715, "step": 1333500 }, { "epoch": 2.7924774759687847, "grad_norm": 12.229247093200684, "learning_rate": 3.611321930222021e-05, "loss": 2.6833, "step": 1334000 }, { "epoch": 2.79352413169441, "grad_norm": 12.976513862609863, "learning_rate": 3.610797504572992e-05, "loss": 2.6663, "step": 1334500 }, { "epoch": 2.7945707874200356, "grad_norm": 13.810970306396484, "learning_rate": 3.610273078923962e-05, "loss": 2.6794, "step": 1335000 }, { "epoch": 2.795617443145661, "grad_norm": 13.88492488861084, "learning_rate": 3.6097486532749334e-05, "loss": 2.6663, "step": 1335500 }, { "epoch": 2.7966640988712865, "grad_norm": 14.370013236999512, "learning_rate": 3.6092242276259045e-05, "loss": 2.652, "step": 1336000 }, { "epoch": 2.797710754596912, "grad_norm": 13.427461624145508, "learning_rate": 3.6086998019768756e-05, "loss": 2.6833, "step": 1336500 }, { "epoch": 2.7987574103225374, "grad_norm": 15.212730407714844, "learning_rate": 3.608175376327846e-05, "loss": 2.684, "step": 1337000 }, { "epoch": 2.799804066048163, "grad_norm": 11.199458122253418, "learning_rate": 3.607650950678817e-05, "loss": 2.6576, "step": 1337500 }, { "epoch": 2.8008507217737884, "grad_norm": 15.59489631652832, "learning_rate": 3.6071265250297876e-05, "loss": 2.6638, "step": 1338000 }, { "epoch": 2.801897377499414, "grad_norm": 11.426929473876953, "learning_rate": 3.606602099380758e-05, "loss": 2.6697, "step": 1338500 }, { "epoch": 2.8029440332250393, "grad_norm": 13.822359085083008, "learning_rate": 3.606077673731729e-05, "loss": 2.6689, "step": 1339000 }, { "epoch": 2.8039906889506647, "grad_norm": 11.58700180053711, "learning_rate": 3.6055532480826995e-05, "loss": 2.6713, "step": 1339500 }, { "epoch": 2.80503734467629, "grad_norm": 12.665946006774902, "learning_rate": 3.6050288224336706e-05, "loss": 2.6636, "step": 1340000 }, { "epoch": 2.8060840004019156, "grad_norm": 12.935935974121094, "learning_rate": 3.604504396784642e-05, "loss": 2.646, "step": 1340500 }, { "epoch": 2.8071306561275415, "grad_norm": 12.55661392211914, "learning_rate": 3.603979971135613e-05, "loss": 2.6692, "step": 1341000 }, { "epoch": 2.808177311853167, "grad_norm": 12.409212112426758, "learning_rate": 3.603455545486583e-05, "loss": 2.6826, "step": 1341500 }, { "epoch": 2.8092239675787924, "grad_norm": 14.580042839050293, "learning_rate": 3.6029311198375544e-05, "loss": 2.6755, "step": 1342000 }, { "epoch": 2.810270623304418, "grad_norm": 12.142247200012207, "learning_rate": 3.602406694188525e-05, "loss": 2.6757, "step": 1342500 }, { "epoch": 2.8113172790300434, "grad_norm": 15.455347061157227, "learning_rate": 3.601882268539496e-05, "loss": 2.6781, "step": 1343000 }, { "epoch": 2.812363934755669, "grad_norm": 13.930235862731934, "learning_rate": 3.6013578428904664e-05, "loss": 2.6637, "step": 1343500 }, { "epoch": 2.8134105904812943, "grad_norm": 14.752680778503418, "learning_rate": 3.600833417241437e-05, "loss": 2.684, "step": 1344000 }, { "epoch": 2.8144572462069197, "grad_norm": 13.494843482971191, "learning_rate": 3.600308991592408e-05, "loss": 2.6685, "step": 1344500 }, { "epoch": 2.815503901932545, "grad_norm": 12.555116653442383, "learning_rate": 3.599784565943379e-05, "loss": 2.6775, "step": 1345000 }, { "epoch": 2.8165505576581706, "grad_norm": 11.676010131835938, "learning_rate": 3.59926014029435e-05, "loss": 2.6634, "step": 1345500 }, { "epoch": 2.817597213383796, "grad_norm": 13.531482696533203, "learning_rate": 3.5987357146453206e-05, "loss": 2.6685, "step": 1346000 }, { "epoch": 2.8186438691094216, "grad_norm": 14.583084106445312, "learning_rate": 3.598211288996292e-05, "loss": 2.6642, "step": 1346500 }, { "epoch": 2.819690524835047, "grad_norm": 14.141303062438965, "learning_rate": 3.597686863347262e-05, "loss": 2.6849, "step": 1347000 }, { "epoch": 2.8207371805606725, "grad_norm": 14.772875785827637, "learning_rate": 3.597162437698233e-05, "loss": 2.6745, "step": 1347500 }, { "epoch": 2.821783836286298, "grad_norm": 14.683125495910645, "learning_rate": 3.5966380120492036e-05, "loss": 2.6755, "step": 1348000 }, { "epoch": 2.8228304920119234, "grad_norm": 12.936901092529297, "learning_rate": 3.596113586400175e-05, "loss": 2.6642, "step": 1348500 }, { "epoch": 2.823877147737549, "grad_norm": 12.141587257385254, "learning_rate": 3.595589160751145e-05, "loss": 2.6643, "step": 1349000 }, { "epoch": 2.8249238034631743, "grad_norm": 14.569124221801758, "learning_rate": 3.595064735102116e-05, "loss": 2.6618, "step": 1349500 }, { "epoch": 2.8259704591887997, "grad_norm": 11.500959396362305, "learning_rate": 3.5945403094530874e-05, "loss": 2.6503, "step": 1350000 }, { "epoch": 2.827017114914425, "grad_norm": 15.974715232849121, "learning_rate": 3.594015883804058e-05, "loss": 2.6527, "step": 1350500 }, { "epoch": 2.8280637706400507, "grad_norm": 12.519254684448242, "learning_rate": 3.593491458155029e-05, "loss": 2.6888, "step": 1351000 }, { "epoch": 2.8291104263656766, "grad_norm": 14.041034698486328, "learning_rate": 3.5929670325059994e-05, "loss": 2.6616, "step": 1351500 }, { "epoch": 2.830157082091302, "grad_norm": 13.295101165771484, "learning_rate": 3.5924426068569705e-05, "loss": 2.639, "step": 1352000 }, { "epoch": 2.8312037378169275, "grad_norm": 12.511035919189453, "learning_rate": 3.591918181207941e-05, "loss": 2.658, "step": 1352500 }, { "epoch": 2.832250393542553, "grad_norm": 13.322244644165039, "learning_rate": 3.591393755558912e-05, "loss": 2.6727, "step": 1353000 }, { "epoch": 2.8332970492681784, "grad_norm": 12.482440948486328, "learning_rate": 3.590869329909883e-05, "loss": 2.6411, "step": 1353500 }, { "epoch": 2.834343704993804, "grad_norm": 12.273538589477539, "learning_rate": 3.590344904260854e-05, "loss": 2.6561, "step": 1354000 }, { "epoch": 2.8353903607194293, "grad_norm": 14.19212818145752, "learning_rate": 3.5898204786118246e-05, "loss": 2.6561, "step": 1354500 }, { "epoch": 2.8364370164450547, "grad_norm": 13.507472038269043, "learning_rate": 3.589296052962796e-05, "loss": 2.6579, "step": 1355000 }, { "epoch": 2.83748367217068, "grad_norm": 12.340697288513184, "learning_rate": 3.588771627313766e-05, "loss": 2.6743, "step": 1355500 }, { "epoch": 2.8385303278963057, "grad_norm": 13.292706489562988, "learning_rate": 3.5882472016647366e-05, "loss": 2.6801, "step": 1356000 }, { "epoch": 2.839576983621931, "grad_norm": 15.20327377319336, "learning_rate": 3.587722776015708e-05, "loss": 2.6476, "step": 1356500 }, { "epoch": 2.8406236393475566, "grad_norm": 14.352019309997559, "learning_rate": 3.587198350366678e-05, "loss": 2.6798, "step": 1357000 }, { "epoch": 2.841670295073182, "grad_norm": 12.035134315490723, "learning_rate": 3.586673924717649e-05, "loss": 2.6744, "step": 1357500 }, { "epoch": 2.8427169507988075, "grad_norm": 12.021844863891602, "learning_rate": 3.5861494990686204e-05, "loss": 2.69, "step": 1358000 }, { "epoch": 2.8437636065244334, "grad_norm": 12.26498031616211, "learning_rate": 3.5856250734195915e-05, "loss": 2.6664, "step": 1358500 }, { "epoch": 2.844810262250059, "grad_norm": 14.302916526794434, "learning_rate": 3.585100647770562e-05, "loss": 2.6672, "step": 1359000 }, { "epoch": 2.8458569179756843, "grad_norm": 11.667933464050293, "learning_rate": 3.584576222121533e-05, "loss": 2.6691, "step": 1359500 }, { "epoch": 2.8469035737013098, "grad_norm": 14.21396541595459, "learning_rate": 3.5840517964725034e-05, "loss": 2.6571, "step": 1360000 }, { "epoch": 2.847950229426935, "grad_norm": 11.662317276000977, "learning_rate": 3.5835273708234746e-05, "loss": 2.6531, "step": 1360500 }, { "epoch": 2.8489968851525607, "grad_norm": 11.651690483093262, "learning_rate": 3.583002945174445e-05, "loss": 2.6596, "step": 1361000 }, { "epoch": 2.850043540878186, "grad_norm": 14.118688583374023, "learning_rate": 3.5824785195254154e-05, "loss": 2.664, "step": 1361500 }, { "epoch": 2.8510901966038116, "grad_norm": 14.250229835510254, "learning_rate": 3.5819540938763865e-05, "loss": 2.6429, "step": 1362000 }, { "epoch": 2.852136852329437, "grad_norm": 13.94018840789795, "learning_rate": 3.5814296682273576e-05, "loss": 2.6663, "step": 1362500 }, { "epoch": 2.8531835080550625, "grad_norm": 14.085872650146484, "learning_rate": 3.580905242578329e-05, "loss": 2.6335, "step": 1363000 }, { "epoch": 2.854230163780688, "grad_norm": 12.469164848327637, "learning_rate": 3.580380816929299e-05, "loss": 2.664, "step": 1363500 }, { "epoch": 2.8552768195063134, "grad_norm": 15.33053207397461, "learning_rate": 3.57985639128027e-05, "loss": 2.6609, "step": 1364000 }, { "epoch": 2.856323475231939, "grad_norm": 19.329193115234375, "learning_rate": 3.579331965631241e-05, "loss": 2.6609, "step": 1364500 }, { "epoch": 2.8573701309575643, "grad_norm": 12.567380905151367, "learning_rate": 3.578807539982212e-05, "loss": 2.6886, "step": 1365000 }, { "epoch": 2.8584167866831898, "grad_norm": 11.788150787353516, "learning_rate": 3.578283114333182e-05, "loss": 2.6551, "step": 1365500 }, { "epoch": 2.8594634424088152, "grad_norm": 13.872835159301758, "learning_rate": 3.5777586886841534e-05, "loss": 2.6501, "step": 1366000 }, { "epoch": 2.8605100981344407, "grad_norm": 14.025126457214355, "learning_rate": 3.577234263035124e-05, "loss": 2.6654, "step": 1366500 }, { "epoch": 2.861556753860066, "grad_norm": 13.066683769226074, "learning_rate": 3.576709837386095e-05, "loss": 2.6814, "step": 1367000 }, { "epoch": 2.8626034095856916, "grad_norm": 13.016444206237793, "learning_rate": 3.576185411737066e-05, "loss": 2.6706, "step": 1367500 }, { "epoch": 2.863650065311317, "grad_norm": 11.443623542785645, "learning_rate": 3.5756609860880364e-05, "loss": 2.6736, "step": 1368000 }, { "epoch": 2.8646967210369425, "grad_norm": 13.447019577026367, "learning_rate": 3.5751365604390075e-05, "loss": 2.6639, "step": 1368500 }, { "epoch": 2.8657433767625684, "grad_norm": 13.838130950927734, "learning_rate": 3.574612134789978e-05, "loss": 2.6752, "step": 1369000 }, { "epoch": 2.866790032488194, "grad_norm": 12.36806583404541, "learning_rate": 3.574087709140949e-05, "loss": 2.6408, "step": 1369500 }, { "epoch": 2.8678366882138193, "grad_norm": 13.557252883911133, "learning_rate": 3.5735632834919195e-05, "loss": 2.6491, "step": 1370000 }, { "epoch": 2.8688833439394448, "grad_norm": 12.184203147888184, "learning_rate": 3.5730388578428906e-05, "loss": 2.6407, "step": 1370500 }, { "epoch": 2.8699299996650702, "grad_norm": 13.914843559265137, "learning_rate": 3.572514432193862e-05, "loss": 2.6621, "step": 1371000 }, { "epoch": 2.8709766553906957, "grad_norm": 13.128222465515137, "learning_rate": 3.571990006544833e-05, "loss": 2.6575, "step": 1371500 }, { "epoch": 2.872023311116321, "grad_norm": 15.00064468383789, "learning_rate": 3.571465580895803e-05, "loss": 2.6654, "step": 1372000 }, { "epoch": 2.8730699668419466, "grad_norm": 14.194578170776367, "learning_rate": 3.570941155246774e-05, "loss": 2.6612, "step": 1372500 }, { "epoch": 2.874116622567572, "grad_norm": 11.99166488647461, "learning_rate": 3.570416729597745e-05, "loss": 2.665, "step": 1373000 }, { "epoch": 2.8751632782931975, "grad_norm": 14.086075782775879, "learning_rate": 3.569892303948715e-05, "loss": 2.6575, "step": 1373500 }, { "epoch": 2.876209934018823, "grad_norm": 13.956181526184082, "learning_rate": 3.569367878299686e-05, "loss": 2.6596, "step": 1374000 }, { "epoch": 2.8772565897444484, "grad_norm": 12.57048225402832, "learning_rate": 3.568843452650657e-05, "loss": 2.6651, "step": 1374500 }, { "epoch": 2.878303245470074, "grad_norm": 13.908008575439453, "learning_rate": 3.568319027001628e-05, "loss": 2.6696, "step": 1375000 }, { "epoch": 2.8793499011956993, "grad_norm": 13.039437294006348, "learning_rate": 3.567794601352599e-05, "loss": 2.6489, "step": 1375500 }, { "epoch": 2.8803965569213252, "grad_norm": 11.14185619354248, "learning_rate": 3.56727017570357e-05, "loss": 2.6595, "step": 1376000 }, { "epoch": 2.8814432126469507, "grad_norm": 14.748930931091309, "learning_rate": 3.5667457500545405e-05, "loss": 2.6662, "step": 1376500 }, { "epoch": 2.882489868372576, "grad_norm": 13.02464771270752, "learning_rate": 3.5662213244055116e-05, "loss": 2.6476, "step": 1377000 }, { "epoch": 2.8835365240982016, "grad_norm": 12.353012084960938, "learning_rate": 3.565696898756482e-05, "loss": 2.6639, "step": 1377500 }, { "epoch": 2.884583179823827, "grad_norm": 12.324892044067383, "learning_rate": 3.5651724731074525e-05, "loss": 2.655, "step": 1378000 }, { "epoch": 2.8856298355494525, "grad_norm": 13.974347114562988, "learning_rate": 3.5646480474584236e-05, "loss": 2.6573, "step": 1378500 }, { "epoch": 2.886676491275078, "grad_norm": 13.06608772277832, "learning_rate": 3.564123621809394e-05, "loss": 2.6485, "step": 1379000 }, { "epoch": 2.8877231470007034, "grad_norm": 12.24650764465332, "learning_rate": 3.563599196160365e-05, "loss": 2.6503, "step": 1379500 }, { "epoch": 2.888769802726329, "grad_norm": 15.243658065795898, "learning_rate": 3.563074770511336e-05, "loss": 2.6561, "step": 1380000 }, { "epoch": 2.8898164584519543, "grad_norm": 12.46841812133789, "learning_rate": 3.5625503448623074e-05, "loss": 2.6666, "step": 1380500 }, { "epoch": 2.89086311417758, "grad_norm": 12.369853019714355, "learning_rate": 3.562025919213278e-05, "loss": 2.6566, "step": 1381000 }, { "epoch": 2.8919097699032053, "grad_norm": 11.779280662536621, "learning_rate": 3.561501493564249e-05, "loss": 2.6575, "step": 1381500 }, { "epoch": 2.8929564256288307, "grad_norm": 11.735363960266113, "learning_rate": 3.560977067915219e-05, "loss": 2.6578, "step": 1382000 }, { "epoch": 2.894003081354456, "grad_norm": 12.318353652954102, "learning_rate": 3.5604526422661904e-05, "loss": 2.6694, "step": 1382500 }, { "epoch": 2.8950497370800816, "grad_norm": 10.812775611877441, "learning_rate": 3.559928216617161e-05, "loss": 2.6554, "step": 1383000 }, { "epoch": 2.896096392805707, "grad_norm": 12.86011028289795, "learning_rate": 3.559403790968132e-05, "loss": 2.6722, "step": 1383500 }, { "epoch": 2.8971430485313325, "grad_norm": 12.6443510055542, "learning_rate": 3.5588793653191024e-05, "loss": 2.6593, "step": 1384000 }, { "epoch": 2.898189704256958, "grad_norm": 14.6143217086792, "learning_rate": 3.5583549396700735e-05, "loss": 2.6615, "step": 1384500 }, { "epoch": 2.8992363599825834, "grad_norm": 14.211135864257812, "learning_rate": 3.5578305140210446e-05, "loss": 2.6633, "step": 1385000 }, { "epoch": 2.900283015708209, "grad_norm": 12.30131721496582, "learning_rate": 3.557306088372015e-05, "loss": 2.6579, "step": 1385500 }, { "epoch": 2.9013296714338344, "grad_norm": 12.34343433380127, "learning_rate": 3.556781662722986e-05, "loss": 2.6557, "step": 1386000 }, { "epoch": 2.9023763271594603, "grad_norm": 12.100115776062012, "learning_rate": 3.5562572370739566e-05, "loss": 2.6598, "step": 1386500 }, { "epoch": 2.9034229828850857, "grad_norm": 11.916877746582031, "learning_rate": 3.555732811424928e-05, "loss": 2.6508, "step": 1387000 }, { "epoch": 2.904469638610711, "grad_norm": 12.550360679626465, "learning_rate": 3.555208385775898e-05, "loss": 2.6454, "step": 1387500 }, { "epoch": 2.9055162943363366, "grad_norm": 14.587700843811035, "learning_rate": 3.554683960126869e-05, "loss": 2.6557, "step": 1388000 }, { "epoch": 2.906562950061962, "grad_norm": 12.207273483276367, "learning_rate": 3.55415953447784e-05, "loss": 2.6313, "step": 1388500 }, { "epoch": 2.9076096057875875, "grad_norm": 13.226744651794434, "learning_rate": 3.5536351088288114e-05, "loss": 2.6525, "step": 1389000 }, { "epoch": 2.908656261513213, "grad_norm": 17.845043182373047, "learning_rate": 3.553110683179782e-05, "loss": 2.6443, "step": 1389500 }, { "epoch": 2.9097029172388384, "grad_norm": 14.693665504455566, "learning_rate": 3.552586257530752e-05, "loss": 2.6692, "step": 1390000 }, { "epoch": 2.910749572964464, "grad_norm": 14.695712089538574, "learning_rate": 3.5520618318817234e-05, "loss": 2.6393, "step": 1390500 }, { "epoch": 2.9117962286900894, "grad_norm": 13.439088821411133, "learning_rate": 3.551537406232694e-05, "loss": 2.6751, "step": 1391000 }, { "epoch": 2.912842884415715, "grad_norm": 12.66231632232666, "learning_rate": 3.551012980583665e-05, "loss": 2.651, "step": 1391500 }, { "epoch": 2.9138895401413403, "grad_norm": 13.34632396697998, "learning_rate": 3.5504885549346354e-05, "loss": 2.641, "step": 1392000 }, { "epoch": 2.9149361958669657, "grad_norm": 13.40145492553711, "learning_rate": 3.5499641292856065e-05, "loss": 2.6543, "step": 1392500 }, { "epoch": 2.9159828515925916, "grad_norm": 12.423115730285645, "learning_rate": 3.5494397036365776e-05, "loss": 2.6581, "step": 1393000 }, { "epoch": 2.917029507318217, "grad_norm": 16.628999710083008, "learning_rate": 3.548915277987549e-05, "loss": 2.6585, "step": 1393500 }, { "epoch": 2.9180761630438425, "grad_norm": 18.223737716674805, "learning_rate": 3.548390852338519e-05, "loss": 2.6691, "step": 1394000 }, { "epoch": 2.919122818769468, "grad_norm": 13.232864379882812, "learning_rate": 3.54786642668949e-05, "loss": 2.6593, "step": 1394500 }, { "epoch": 2.9201694744950935, "grad_norm": 13.854496955871582, "learning_rate": 3.547342001040461e-05, "loss": 2.6266, "step": 1395000 }, { "epoch": 2.921216130220719, "grad_norm": 15.287409782409668, "learning_rate": 3.546817575391431e-05, "loss": 2.6798, "step": 1395500 }, { "epoch": 2.9222627859463444, "grad_norm": 15.227381706237793, "learning_rate": 3.546293149742402e-05, "loss": 2.6538, "step": 1396000 }, { "epoch": 2.92330944167197, "grad_norm": 14.432832717895508, "learning_rate": 3.5457687240933726e-05, "loss": 2.6664, "step": 1396500 }, { "epoch": 2.9243560973975953, "grad_norm": 12.722892761230469, "learning_rate": 3.545244298444344e-05, "loss": 2.66, "step": 1397000 }, { "epoch": 2.9254027531232207, "grad_norm": 15.638559341430664, "learning_rate": 3.544719872795315e-05, "loss": 2.6555, "step": 1397500 }, { "epoch": 2.926449408848846, "grad_norm": 12.348729133605957, "learning_rate": 3.544195447146286e-05, "loss": 2.6497, "step": 1398000 }, { "epoch": 2.9274960645744716, "grad_norm": 12.521516799926758, "learning_rate": 3.5436710214972564e-05, "loss": 2.6516, "step": 1398500 }, { "epoch": 2.928542720300097, "grad_norm": 14.10930061340332, "learning_rate": 3.5431465958482275e-05, "loss": 2.6525, "step": 1399000 }, { "epoch": 2.9295893760257226, "grad_norm": 12.001261711120605, "learning_rate": 3.542622170199198e-05, "loss": 2.6464, "step": 1399500 }, { "epoch": 2.930636031751348, "grad_norm": 13.75955867767334, "learning_rate": 3.542097744550169e-05, "loss": 2.6561, "step": 1400000 }, { "epoch": 2.9316826874769735, "grad_norm": 13.67823600769043, "learning_rate": 3.5415733189011395e-05, "loss": 2.6343, "step": 1400500 }, { "epoch": 2.932729343202599, "grad_norm": 13.532024383544922, "learning_rate": 3.54104889325211e-05, "loss": 2.6578, "step": 1401000 }, { "epoch": 2.9337759989282244, "grad_norm": 12.228747367858887, "learning_rate": 3.540524467603081e-05, "loss": 2.6542, "step": 1401500 }, { "epoch": 2.93482265465385, "grad_norm": 12.940788269042969, "learning_rate": 3.540000041954052e-05, "loss": 2.6549, "step": 1402000 }, { "epoch": 2.9358693103794753, "grad_norm": 13.313591957092285, "learning_rate": 3.539475616305023e-05, "loss": 2.6308, "step": 1402500 }, { "epoch": 2.9369159661051008, "grad_norm": 13.138005256652832, "learning_rate": 3.5389511906559937e-05, "loss": 2.6686, "step": 1403000 }, { "epoch": 2.937962621830726, "grad_norm": 11.465307235717773, "learning_rate": 3.538426765006965e-05, "loss": 2.6538, "step": 1403500 }, { "epoch": 2.939009277556352, "grad_norm": 11.932931900024414, "learning_rate": 3.537902339357935e-05, "loss": 2.6525, "step": 1404000 }, { "epoch": 2.9400559332819776, "grad_norm": 13.318303108215332, "learning_rate": 3.537377913708906e-05, "loss": 2.643, "step": 1404500 }, { "epoch": 2.941102589007603, "grad_norm": 12.807415962219238, "learning_rate": 3.536853488059877e-05, "loss": 2.646, "step": 1405000 }, { "epoch": 2.9421492447332285, "grad_norm": 14.985448837280273, "learning_rate": 3.536329062410848e-05, "loss": 2.6543, "step": 1405500 }, { "epoch": 2.943195900458854, "grad_norm": 15.05138111114502, "learning_rate": 3.535804636761819e-05, "loss": 2.6673, "step": 1406000 }, { "epoch": 2.9442425561844794, "grad_norm": 12.830159187316895, "learning_rate": 3.5352802111127894e-05, "loss": 2.6425, "step": 1406500 }, { "epoch": 2.945289211910105, "grad_norm": 14.672673225402832, "learning_rate": 3.5347557854637605e-05, "loss": 2.643, "step": 1407000 }, { "epoch": 2.9463358676357303, "grad_norm": 13.634390830993652, "learning_rate": 3.534231359814731e-05, "loss": 2.6504, "step": 1407500 }, { "epoch": 2.9473825233613558, "grad_norm": 13.006481170654297, "learning_rate": 3.533706934165702e-05, "loss": 2.6713, "step": 1408000 }, { "epoch": 2.948429179086981, "grad_norm": 13.679216384887695, "learning_rate": 3.5331825085166725e-05, "loss": 2.6375, "step": 1408500 }, { "epoch": 2.9494758348126067, "grad_norm": 12.207679748535156, "learning_rate": 3.5326580828676436e-05, "loss": 2.6394, "step": 1409000 }, { "epoch": 2.950522490538232, "grad_norm": 15.540706634521484, "learning_rate": 3.532133657218614e-05, "loss": 2.6631, "step": 1409500 }, { "epoch": 2.9515691462638576, "grad_norm": 12.756446838378906, "learning_rate": 3.531609231569585e-05, "loss": 2.6481, "step": 1410000 }, { "epoch": 2.9526158019894835, "grad_norm": 12.67619800567627, "learning_rate": 3.531084805920556e-05, "loss": 2.6559, "step": 1410500 }, { "epoch": 2.953662457715109, "grad_norm": 15.738232612609863, "learning_rate": 3.530560380271527e-05, "loss": 2.6699, "step": 1411000 }, { "epoch": 2.9547091134407344, "grad_norm": 11.38714599609375, "learning_rate": 3.530035954622498e-05, "loss": 2.6529, "step": 1411500 }, { "epoch": 2.95575576916636, "grad_norm": 11.861553192138672, "learning_rate": 3.529511528973468e-05, "loss": 2.6324, "step": 1412000 }, { "epoch": 2.9568024248919853, "grad_norm": 14.519867897033691, "learning_rate": 3.528987103324439e-05, "loss": 2.6196, "step": 1412500 }, { "epoch": 2.9578490806176108, "grad_norm": 14.592073440551758, "learning_rate": 3.52846267767541e-05, "loss": 2.6415, "step": 1413000 }, { "epoch": 2.958895736343236, "grad_norm": 12.246954917907715, "learning_rate": 3.527938252026381e-05, "loss": 2.6464, "step": 1413500 }, { "epoch": 2.9599423920688617, "grad_norm": 14.951851844787598, "learning_rate": 3.527413826377351e-05, "loss": 2.653, "step": 1414000 }, { "epoch": 2.960989047794487, "grad_norm": 26.791683197021484, "learning_rate": 3.5268894007283224e-05, "loss": 2.6693, "step": 1414500 }, { "epoch": 2.9620357035201126, "grad_norm": 13.332387924194336, "learning_rate": 3.5263649750792935e-05, "loss": 2.6564, "step": 1415000 }, { "epoch": 2.963082359245738, "grad_norm": 13.150799751281738, "learning_rate": 3.5258405494302646e-05, "loss": 2.6268, "step": 1415500 }, { "epoch": 2.9641290149713635, "grad_norm": 13.107942581176758, "learning_rate": 3.525316123781235e-05, "loss": 2.6582, "step": 1416000 }, { "epoch": 2.965175670696989, "grad_norm": 14.79456615447998, "learning_rate": 3.524791698132206e-05, "loss": 2.648, "step": 1416500 }, { "epoch": 2.9662223264226144, "grad_norm": 12.479812622070312, "learning_rate": 3.5242672724831765e-05, "loss": 2.6679, "step": 1417000 }, { "epoch": 2.96726898214824, "grad_norm": 13.129070281982422, "learning_rate": 3.5237428468341477e-05, "loss": 2.6525, "step": 1417500 }, { "epoch": 2.9683156378738653, "grad_norm": 12.257540702819824, "learning_rate": 3.523218421185118e-05, "loss": 2.6556, "step": 1418000 }, { "epoch": 2.9693622935994908, "grad_norm": 14.132040977478027, "learning_rate": 3.5226939955360885e-05, "loss": 2.6679, "step": 1418500 }, { "epoch": 2.9704089493251162, "grad_norm": 15.23808765411377, "learning_rate": 3.5221695698870596e-05, "loss": 2.6529, "step": 1419000 }, { "epoch": 2.9714556050507417, "grad_norm": 13.511484146118164, "learning_rate": 3.521645144238031e-05, "loss": 2.6488, "step": 1419500 }, { "epoch": 2.972502260776367, "grad_norm": 15.468308448791504, "learning_rate": 3.521120718589002e-05, "loss": 2.6681, "step": 1420000 }, { "epoch": 2.9735489165019926, "grad_norm": 14.384754180908203, "learning_rate": 3.520596292939972e-05, "loss": 2.6409, "step": 1420500 }, { "epoch": 2.974595572227618, "grad_norm": 12.621716499328613, "learning_rate": 3.5200718672909434e-05, "loss": 2.6445, "step": 1421000 }, { "epoch": 2.975642227953244, "grad_norm": 13.053337097167969, "learning_rate": 3.519547441641914e-05, "loss": 2.6432, "step": 1421500 }, { "epoch": 2.9766888836788694, "grad_norm": 15.014132499694824, "learning_rate": 3.519023015992885e-05, "loss": 2.6516, "step": 1422000 }, { "epoch": 2.977735539404495, "grad_norm": 13.55465030670166, "learning_rate": 3.5184985903438553e-05, "loss": 2.6449, "step": 1422500 }, { "epoch": 2.9787821951301203, "grad_norm": 12.822999954223633, "learning_rate": 3.5179741646948265e-05, "loss": 2.661, "step": 1423000 }, { "epoch": 2.979828850855746, "grad_norm": 14.712160110473633, "learning_rate": 3.517449739045797e-05, "loss": 2.6503, "step": 1423500 }, { "epoch": 2.9808755065813712, "grad_norm": 12.260327339172363, "learning_rate": 3.516925313396768e-05, "loss": 2.6308, "step": 1424000 }, { "epoch": 2.9819221623069967, "grad_norm": 12.942323684692383, "learning_rate": 3.516400887747739e-05, "loss": 2.6477, "step": 1424500 }, { "epoch": 2.982968818032622, "grad_norm": 13.312253952026367, "learning_rate": 3.5158764620987095e-05, "loss": 2.6427, "step": 1425000 }, { "epoch": 2.9840154737582476, "grad_norm": 12.976978302001953, "learning_rate": 3.5153520364496806e-05, "loss": 2.6507, "step": 1425500 }, { "epoch": 2.985062129483873, "grad_norm": 12.97326946258545, "learning_rate": 3.514827610800651e-05, "loss": 2.6494, "step": 1426000 }, { "epoch": 2.9861087852094985, "grad_norm": 14.909210205078125, "learning_rate": 3.514303185151622e-05, "loss": 2.6559, "step": 1426500 }, { "epoch": 2.987155440935124, "grad_norm": 12.493230819702148, "learning_rate": 3.5137787595025926e-05, "loss": 2.6406, "step": 1427000 }, { "epoch": 2.9882020966607494, "grad_norm": 14.082497596740723, "learning_rate": 3.513254333853564e-05, "loss": 2.6381, "step": 1427500 }, { "epoch": 2.9892487523863753, "grad_norm": 12.769926071166992, "learning_rate": 3.512729908204535e-05, "loss": 2.643, "step": 1428000 }, { "epoch": 2.990295408112001, "grad_norm": 15.929224967956543, "learning_rate": 3.512205482555506e-05, "loss": 2.6621, "step": 1428500 }, { "epoch": 2.9913420638376262, "grad_norm": 14.387359619140625, "learning_rate": 3.5116810569064764e-05, "loss": 2.6705, "step": 1429000 }, { "epoch": 2.9923887195632517, "grad_norm": 15.59206771850586, "learning_rate": 3.511156631257447e-05, "loss": 2.6451, "step": 1429500 }, { "epoch": 2.993435375288877, "grad_norm": 12.813943862915039, "learning_rate": 3.510632205608418e-05, "loss": 2.6492, "step": 1430000 }, { "epoch": 2.9944820310145026, "grad_norm": 12.617176055908203, "learning_rate": 3.510107779959388e-05, "loss": 2.6482, "step": 1430500 }, { "epoch": 2.995528686740128, "grad_norm": 14.557257652282715, "learning_rate": 3.5095833543103594e-05, "loss": 2.6572, "step": 1431000 }, { "epoch": 2.9965753424657535, "grad_norm": 12.842638969421387, "learning_rate": 3.50905892866133e-05, "loss": 2.6308, "step": 1431500 }, { "epoch": 2.997621998191379, "grad_norm": 12.232345581054688, "learning_rate": 3.508534503012301e-05, "loss": 2.6504, "step": 1432000 }, { "epoch": 2.9986686539170044, "grad_norm": 13.492412567138672, "learning_rate": 3.508010077363272e-05, "loss": 2.6423, "step": 1432500 }, { "epoch": 2.99971530964263, "grad_norm": 12.16695499420166, "learning_rate": 3.507485651714243e-05, "loss": 2.6397, "step": 1433000 }, { "epoch": 3.0007619653682553, "grad_norm": 11.887638092041016, "learning_rate": 3.5069612260652136e-05, "loss": 2.6437, "step": 1433500 }, { "epoch": 3.001808621093881, "grad_norm": 11.729731559753418, "learning_rate": 3.506436800416185e-05, "loss": 2.6392, "step": 1434000 }, { "epoch": 3.0028552768195063, "grad_norm": 12.781086921691895, "learning_rate": 3.505912374767155e-05, "loss": 2.6301, "step": 1434500 }, { "epoch": 3.0039019325451317, "grad_norm": 13.012895584106445, "learning_rate": 3.5053879491181256e-05, "loss": 2.6265, "step": 1435000 }, { "epoch": 3.004948588270757, "grad_norm": 12.854278564453125, "learning_rate": 3.504863523469097e-05, "loss": 2.6635, "step": 1435500 }, { "epoch": 3.0059952439963826, "grad_norm": 13.670636177062988, "learning_rate": 3.504339097820067e-05, "loss": 2.6457, "step": 1436000 }, { "epoch": 3.007041899722008, "grad_norm": 13.25955867767334, "learning_rate": 3.503814672171038e-05, "loss": 2.6357, "step": 1436500 }, { "epoch": 3.0080885554476335, "grad_norm": 11.57446575164795, "learning_rate": 3.5032902465220093e-05, "loss": 2.635, "step": 1437000 }, { "epoch": 3.009135211173259, "grad_norm": 15.847558975219727, "learning_rate": 3.5027658208729805e-05, "loss": 2.6277, "step": 1437500 }, { "epoch": 3.010181866898885, "grad_norm": 13.843395233154297, "learning_rate": 3.502241395223951e-05, "loss": 2.6341, "step": 1438000 }, { "epoch": 3.0112285226245104, "grad_norm": 13.505261421203613, "learning_rate": 3.501716969574922e-05, "loss": 2.6559, "step": 1438500 }, { "epoch": 3.012275178350136, "grad_norm": 13.566131591796875, "learning_rate": 3.5011925439258924e-05, "loss": 2.6073, "step": 1439000 }, { "epoch": 3.0133218340757613, "grad_norm": 14.479096412658691, "learning_rate": 3.5006681182768635e-05, "loss": 2.6346, "step": 1439500 }, { "epoch": 3.0143684898013867, "grad_norm": 12.723515510559082, "learning_rate": 3.500143692627834e-05, "loss": 2.6533, "step": 1440000 }, { "epoch": 3.015415145527012, "grad_norm": 15.649518013000488, "learning_rate": 3.4996192669788044e-05, "loss": 2.6408, "step": 1440500 }, { "epoch": 3.0164618012526376, "grad_norm": 12.794143676757812, "learning_rate": 3.4990948413297755e-05, "loss": 2.6177, "step": 1441000 }, { "epoch": 3.017508456978263, "grad_norm": 15.866337776184082, "learning_rate": 3.4985704156807466e-05, "loss": 2.6228, "step": 1441500 }, { "epoch": 3.0185551127038885, "grad_norm": 16.496614456176758, "learning_rate": 3.498045990031718e-05, "loss": 2.6327, "step": 1442000 }, { "epoch": 3.019601768429514, "grad_norm": 14.295076370239258, "learning_rate": 3.497521564382688e-05, "loss": 2.6447, "step": 1442500 }, { "epoch": 3.0206484241551395, "grad_norm": 15.71428108215332, "learning_rate": 3.496997138733659e-05, "loss": 2.6472, "step": 1443000 }, { "epoch": 3.021695079880765, "grad_norm": 14.599620819091797, "learning_rate": 3.49647271308463e-05, "loss": 2.6554, "step": 1443500 }, { "epoch": 3.0227417356063904, "grad_norm": 13.700776100158691, "learning_rate": 3.495948287435601e-05, "loss": 2.6392, "step": 1444000 }, { "epoch": 3.023788391332016, "grad_norm": 13.438300132751465, "learning_rate": 3.495423861786571e-05, "loss": 2.6574, "step": 1444500 }, { "epoch": 3.0248350470576413, "grad_norm": 14.384981155395508, "learning_rate": 3.494899436137542e-05, "loss": 2.629, "step": 1445000 }, { "epoch": 3.0258817027832667, "grad_norm": 12.250630378723145, "learning_rate": 3.4943750104885134e-05, "loss": 2.6573, "step": 1445500 }, { "epoch": 3.026928358508892, "grad_norm": 14.176665306091309, "learning_rate": 3.4938505848394845e-05, "loss": 2.651, "step": 1446000 }, { "epoch": 3.027975014234518, "grad_norm": 13.158941268920898, "learning_rate": 3.493326159190455e-05, "loss": 2.6425, "step": 1446500 }, { "epoch": 3.0290216699601435, "grad_norm": 10.699337005615234, "learning_rate": 3.4928017335414254e-05, "loss": 2.6269, "step": 1447000 }, { "epoch": 3.030068325685769, "grad_norm": 12.567200660705566, "learning_rate": 3.4922773078923965e-05, "loss": 2.646, "step": 1447500 }, { "epoch": 3.0311149814113945, "grad_norm": 13.579790115356445, "learning_rate": 3.491752882243367e-05, "loss": 2.6309, "step": 1448000 }, { "epoch": 3.03216163713702, "grad_norm": 11.123578071594238, "learning_rate": 3.491228456594338e-05, "loss": 2.6212, "step": 1448500 }, { "epoch": 3.0332082928626454, "grad_norm": 12.600113868713379, "learning_rate": 3.4907040309453085e-05, "loss": 2.6152, "step": 1449000 }, { "epoch": 3.034254948588271, "grad_norm": 12.780572891235352, "learning_rate": 3.4901796052962796e-05, "loss": 2.6456, "step": 1449500 }, { "epoch": 3.0353016043138963, "grad_norm": 11.754828453063965, "learning_rate": 3.489655179647251e-05, "loss": 2.633, "step": 1450000 }, { "epoch": 3.0363482600395217, "grad_norm": 14.250818252563477, "learning_rate": 3.489130753998222e-05, "loss": 2.6303, "step": 1450500 }, { "epoch": 3.037394915765147, "grad_norm": 12.866534233093262, "learning_rate": 3.488606328349192e-05, "loss": 2.6421, "step": 1451000 }, { "epoch": 3.0384415714907727, "grad_norm": 13.61889362335205, "learning_rate": 3.4880819027001633e-05, "loss": 2.6474, "step": 1451500 }, { "epoch": 3.039488227216398, "grad_norm": 12.416779518127441, "learning_rate": 3.487557477051134e-05, "loss": 2.652, "step": 1452000 }, { "epoch": 3.0405348829420236, "grad_norm": 11.131457328796387, "learning_rate": 3.487033051402104e-05, "loss": 2.6328, "step": 1452500 }, { "epoch": 3.041581538667649, "grad_norm": 12.02763843536377, "learning_rate": 3.486508625753075e-05, "loss": 2.6321, "step": 1453000 }, { "epoch": 3.0426281943932745, "grad_norm": 13.334866523742676, "learning_rate": 3.485984200104046e-05, "loss": 2.6315, "step": 1453500 }, { "epoch": 3.0436748501189, "grad_norm": 13.950715065002441, "learning_rate": 3.485459774455017e-05, "loss": 2.638, "step": 1454000 }, { "epoch": 3.0447215058445254, "grad_norm": 12.634522438049316, "learning_rate": 3.484935348805988e-05, "loss": 2.6501, "step": 1454500 }, { "epoch": 3.045768161570151, "grad_norm": 12.736137390136719, "learning_rate": 3.484410923156959e-05, "loss": 2.6566, "step": 1455000 }, { "epoch": 3.0468148172957767, "grad_norm": 14.539774894714355, "learning_rate": 3.4838864975079295e-05, "loss": 2.6169, "step": 1455500 }, { "epoch": 3.047861473021402, "grad_norm": 12.960761070251465, "learning_rate": 3.4833620718589006e-05, "loss": 2.641, "step": 1456000 }, { "epoch": 3.0489081287470277, "grad_norm": 15.461224555969238, "learning_rate": 3.482837646209871e-05, "loss": 2.639, "step": 1456500 }, { "epoch": 3.049954784472653, "grad_norm": 13.60828971862793, "learning_rate": 3.482313220560842e-05, "loss": 2.6449, "step": 1457000 }, { "epoch": 3.0510014401982786, "grad_norm": 12.661030769348145, "learning_rate": 3.4817887949118126e-05, "loss": 2.6389, "step": 1457500 }, { "epoch": 3.052048095923904, "grad_norm": 14.974385261535645, "learning_rate": 3.481264369262783e-05, "loss": 2.6209, "step": 1458000 }, { "epoch": 3.0530947516495295, "grad_norm": 15.232998847961426, "learning_rate": 3.480739943613754e-05, "loss": 2.6355, "step": 1458500 }, { "epoch": 3.054141407375155, "grad_norm": 13.001158714294434, "learning_rate": 3.480215517964725e-05, "loss": 2.6347, "step": 1459000 }, { "epoch": 3.0551880631007804, "grad_norm": 12.379890441894531, "learning_rate": 3.479691092315696e-05, "loss": 2.6394, "step": 1459500 }, { "epoch": 3.056234718826406, "grad_norm": 12.440519332885742, "learning_rate": 3.479166666666667e-05, "loss": 2.659, "step": 1460000 }, { "epoch": 3.0572813745520313, "grad_norm": 15.309429168701172, "learning_rate": 3.478642241017638e-05, "loss": 2.6214, "step": 1460500 }, { "epoch": 3.0583280302776568, "grad_norm": 12.441878318786621, "learning_rate": 3.478117815368608e-05, "loss": 2.6366, "step": 1461000 }, { "epoch": 3.059374686003282, "grad_norm": 14.018706321716309, "learning_rate": 3.4775933897195794e-05, "loss": 2.6308, "step": 1461500 }, { "epoch": 3.0604213417289077, "grad_norm": 12.224913597106934, "learning_rate": 3.47706896407055e-05, "loss": 2.6353, "step": 1462000 }, { "epoch": 3.061467997454533, "grad_norm": 13.912290573120117, "learning_rate": 3.476544538421521e-05, "loss": 2.6398, "step": 1462500 }, { "epoch": 3.0625146531801586, "grad_norm": 14.702667236328125, "learning_rate": 3.476020112772492e-05, "loss": 2.6284, "step": 1463000 }, { "epoch": 3.063561308905784, "grad_norm": 17.26244354248047, "learning_rate": 3.4754956871234625e-05, "loss": 2.6297, "step": 1463500 }, { "epoch": 3.06460796463141, "grad_norm": 11.019371032714844, "learning_rate": 3.4749712614744336e-05, "loss": 2.6446, "step": 1464000 }, { "epoch": 3.0656546203570354, "grad_norm": 14.129372596740723, "learning_rate": 3.474446835825404e-05, "loss": 2.6459, "step": 1464500 }, { "epoch": 3.066701276082661, "grad_norm": 12.363240242004395, "learning_rate": 3.473922410176375e-05, "loss": 2.6472, "step": 1465000 }, { "epoch": 3.0677479318082863, "grad_norm": 12.559479713439941, "learning_rate": 3.4733979845273456e-05, "loss": 2.6361, "step": 1465500 }, { "epoch": 3.0687945875339118, "grad_norm": 15.342777252197266, "learning_rate": 3.472873558878317e-05, "loss": 2.6451, "step": 1466000 }, { "epoch": 3.0698412432595372, "grad_norm": 14.668294906616211, "learning_rate": 3.472349133229287e-05, "loss": 2.6551, "step": 1466500 }, { "epoch": 3.0708878989851627, "grad_norm": 14.272711753845215, "learning_rate": 3.471824707580258e-05, "loss": 2.62, "step": 1467000 }, { "epoch": 3.071934554710788, "grad_norm": 12.119131088256836, "learning_rate": 3.471300281931229e-05, "loss": 2.6578, "step": 1467500 }, { "epoch": 3.0729812104364136, "grad_norm": 11.892217636108398, "learning_rate": 3.4707758562822004e-05, "loss": 2.6518, "step": 1468000 }, { "epoch": 3.074027866162039, "grad_norm": 13.633427619934082, "learning_rate": 3.470251430633171e-05, "loss": 2.6264, "step": 1468500 }, { "epoch": 3.0750745218876645, "grad_norm": 13.834212303161621, "learning_rate": 3.469727004984141e-05, "loss": 2.6663, "step": 1469000 }, { "epoch": 3.07612117761329, "grad_norm": 13.87516975402832, "learning_rate": 3.4692025793351124e-05, "loss": 2.6386, "step": 1469500 }, { "epoch": 3.0771678333389154, "grad_norm": 18.052860260009766, "learning_rate": 3.468678153686083e-05, "loss": 2.6641, "step": 1470000 }, { "epoch": 3.078214489064541, "grad_norm": 13.63599967956543, "learning_rate": 3.468153728037054e-05, "loss": 2.6209, "step": 1470500 }, { "epoch": 3.0792611447901663, "grad_norm": 12.191777229309082, "learning_rate": 3.4676293023880244e-05, "loss": 2.6399, "step": 1471000 }, { "epoch": 3.080307800515792, "grad_norm": 13.844764709472656, "learning_rate": 3.4671048767389955e-05, "loss": 2.6265, "step": 1471500 }, { "epoch": 3.0813544562414172, "grad_norm": 11.84648323059082, "learning_rate": 3.4665804510899666e-05, "loss": 2.6224, "step": 1472000 }, { "epoch": 3.0824011119670427, "grad_norm": 13.440937042236328, "learning_rate": 3.466056025440938e-05, "loss": 2.6369, "step": 1472500 }, { "epoch": 3.0834477676926686, "grad_norm": 11.130380630493164, "learning_rate": 3.465531599791908e-05, "loss": 2.6573, "step": 1473000 }, { "epoch": 3.084494423418294, "grad_norm": 12.740432739257812, "learning_rate": 3.465007174142879e-05, "loss": 2.6192, "step": 1473500 }, { "epoch": 3.0855410791439195, "grad_norm": 12.27241039276123, "learning_rate": 3.4644827484938496e-05, "loss": 2.6477, "step": 1474000 }, { "epoch": 3.086587734869545, "grad_norm": 12.770792007446289, "learning_rate": 3.46395832284482e-05, "loss": 2.6281, "step": 1474500 }, { "epoch": 3.0876343905951704, "grad_norm": 13.426055908203125, "learning_rate": 3.463433897195791e-05, "loss": 2.6375, "step": 1475000 }, { "epoch": 3.088681046320796, "grad_norm": 13.933332443237305, "learning_rate": 3.4629094715467616e-05, "loss": 2.6404, "step": 1475500 }, { "epoch": 3.0897277020464213, "grad_norm": 14.94378662109375, "learning_rate": 3.462385045897733e-05, "loss": 2.6337, "step": 1476000 }, { "epoch": 3.090774357772047, "grad_norm": 13.75509262084961, "learning_rate": 3.461860620248704e-05, "loss": 2.6317, "step": 1476500 }, { "epoch": 3.0918210134976722, "grad_norm": 14.13331413269043, "learning_rate": 3.461336194599675e-05, "loss": 2.6424, "step": 1477000 }, { "epoch": 3.0928676692232977, "grad_norm": 12.50110912322998, "learning_rate": 3.4608117689506454e-05, "loss": 2.6431, "step": 1477500 }, { "epoch": 3.093914324948923, "grad_norm": 16.072546005249023, "learning_rate": 3.4602873433016165e-05, "loss": 2.6365, "step": 1478000 }, { "epoch": 3.0949609806745486, "grad_norm": 13.25659465789795, "learning_rate": 3.459762917652587e-05, "loss": 2.6318, "step": 1478500 }, { "epoch": 3.096007636400174, "grad_norm": 14.38725757598877, "learning_rate": 3.459238492003558e-05, "loss": 2.6308, "step": 1479000 }, { "epoch": 3.0970542921257995, "grad_norm": 13.953865051269531, "learning_rate": 3.4587140663545284e-05, "loss": 2.6446, "step": 1479500 }, { "epoch": 3.098100947851425, "grad_norm": 17.598468780517578, "learning_rate": 3.4581896407054996e-05, "loss": 2.6181, "step": 1480000 }, { "epoch": 3.0991476035770504, "grad_norm": 15.82093620300293, "learning_rate": 3.457665215056471e-05, "loss": 2.6285, "step": 1480500 }, { "epoch": 3.100194259302676, "grad_norm": 13.749335289001465, "learning_rate": 3.457140789407441e-05, "loss": 2.6292, "step": 1481000 }, { "epoch": 3.101240915028302, "grad_norm": 13.198698043823242, "learning_rate": 3.456616363758412e-05, "loss": 2.6414, "step": 1481500 }, { "epoch": 3.1022875707539272, "grad_norm": 11.227836608886719, "learning_rate": 3.4560919381093826e-05, "loss": 2.629, "step": 1482000 }, { "epoch": 3.1033342264795527, "grad_norm": 13.408361434936523, "learning_rate": 3.455567512460354e-05, "loss": 2.6439, "step": 1482500 }, { "epoch": 3.104380882205178, "grad_norm": 12.870560646057129, "learning_rate": 3.455043086811324e-05, "loss": 2.617, "step": 1483000 }, { "epoch": 3.1054275379308036, "grad_norm": 12.472450256347656, "learning_rate": 3.454518661162295e-05, "loss": 2.6373, "step": 1483500 }, { "epoch": 3.106474193656429, "grad_norm": 13.597036361694336, "learning_rate": 3.453994235513266e-05, "loss": 2.6219, "step": 1484000 }, { "epoch": 3.1075208493820545, "grad_norm": 13.210114479064941, "learning_rate": 3.453469809864237e-05, "loss": 2.6265, "step": 1484500 }, { "epoch": 3.10856750510768, "grad_norm": 12.022546768188477, "learning_rate": 3.452945384215208e-05, "loss": 2.6501, "step": 1485000 }, { "epoch": 3.1096141608333054, "grad_norm": 14.5872802734375, "learning_rate": 3.452420958566179e-05, "loss": 2.6426, "step": 1485500 }, { "epoch": 3.110660816558931, "grad_norm": 12.993782043457031, "learning_rate": 3.4518965329171495e-05, "loss": 2.6376, "step": 1486000 }, { "epoch": 3.1117074722845564, "grad_norm": 14.362296104431152, "learning_rate": 3.45137210726812e-05, "loss": 2.6461, "step": 1486500 }, { "epoch": 3.112754128010182, "grad_norm": 15.634446144104004, "learning_rate": 3.450847681619091e-05, "loss": 2.6222, "step": 1487000 }, { "epoch": 3.1138007837358073, "grad_norm": 12.86510181427002, "learning_rate": 3.4503232559700614e-05, "loss": 2.6255, "step": 1487500 }, { "epoch": 3.1148474394614327, "grad_norm": 13.405976295471191, "learning_rate": 3.4497988303210325e-05, "loss": 2.6177, "step": 1488000 }, { "epoch": 3.115894095187058, "grad_norm": 13.061752319335938, "learning_rate": 3.449274404672003e-05, "loss": 2.6234, "step": 1488500 }, { "epoch": 3.1169407509126836, "grad_norm": 13.127534866333008, "learning_rate": 3.448749979022974e-05, "loss": 2.6295, "step": 1489000 }, { "epoch": 3.117987406638309, "grad_norm": 14.360464096069336, "learning_rate": 3.448225553373945e-05, "loss": 2.6305, "step": 1489500 }, { "epoch": 3.1190340623639345, "grad_norm": 13.032873153686523, "learning_rate": 3.447701127724916e-05, "loss": 2.6227, "step": 1490000 }, { "epoch": 3.1200807180895604, "grad_norm": 13.883295059204102, "learning_rate": 3.447176702075887e-05, "loss": 2.6369, "step": 1490500 }, { "epoch": 3.121127373815186, "grad_norm": 12.673243522644043, "learning_rate": 3.446652276426858e-05, "loss": 2.6298, "step": 1491000 }, { "epoch": 3.1221740295408114, "grad_norm": 14.432321548461914, "learning_rate": 3.446127850777828e-05, "loss": 2.6207, "step": 1491500 }, { "epoch": 3.123220685266437, "grad_norm": 14.060636520385742, "learning_rate": 3.445603425128799e-05, "loss": 2.6438, "step": 1492000 }, { "epoch": 3.1242673409920623, "grad_norm": 11.831356048583984, "learning_rate": 3.44507899947977e-05, "loss": 2.6479, "step": 1492500 }, { "epoch": 3.1253139967176877, "grad_norm": 12.04127025604248, "learning_rate": 3.44455457383074e-05, "loss": 2.6398, "step": 1493000 }, { "epoch": 3.126360652443313, "grad_norm": 14.17192268371582, "learning_rate": 3.444030148181711e-05, "loss": 2.6351, "step": 1493500 }, { "epoch": 3.1274073081689386, "grad_norm": 15.545491218566895, "learning_rate": 3.4435057225326824e-05, "loss": 2.6061, "step": 1494000 }, { "epoch": 3.128453963894564, "grad_norm": 13.718291282653809, "learning_rate": 3.4429812968836536e-05, "loss": 2.6288, "step": 1494500 }, { "epoch": 3.1295006196201895, "grad_norm": 12.574398040771484, "learning_rate": 3.442456871234624e-05, "loss": 2.6145, "step": 1495000 }, { "epoch": 3.130547275345815, "grad_norm": 15.955409049987793, "learning_rate": 3.441932445585595e-05, "loss": 2.6274, "step": 1495500 }, { "epoch": 3.1315939310714405, "grad_norm": 12.639162063598633, "learning_rate": 3.4414080199365655e-05, "loss": 2.6422, "step": 1496000 }, { "epoch": 3.132640586797066, "grad_norm": 14.843032836914062, "learning_rate": 3.4408835942875366e-05, "loss": 2.6155, "step": 1496500 }, { "epoch": 3.1336872425226914, "grad_norm": 15.747413635253906, "learning_rate": 3.440359168638507e-05, "loss": 2.6281, "step": 1497000 }, { "epoch": 3.134733898248317, "grad_norm": 14.462749481201172, "learning_rate": 3.439834742989478e-05, "loss": 2.6453, "step": 1497500 }, { "epoch": 3.1357805539739423, "grad_norm": 13.883862495422363, "learning_rate": 3.439310317340449e-05, "loss": 2.6324, "step": 1498000 }, { "epoch": 3.1368272096995677, "grad_norm": 13.23106575012207, "learning_rate": 3.43878589169142e-05, "loss": 2.6186, "step": 1498500 }, { "epoch": 3.1378738654251936, "grad_norm": 13.88731575012207, "learning_rate": 3.438261466042391e-05, "loss": 2.6347, "step": 1499000 }, { "epoch": 3.138920521150819, "grad_norm": 14.852933883666992, "learning_rate": 3.437737040393361e-05, "loss": 2.6451, "step": 1499500 }, { "epoch": 3.1399671768764446, "grad_norm": 16.911096572875977, "learning_rate": 3.4372126147443324e-05, "loss": 2.6463, "step": 1500000 }, { "epoch": 3.14101383260207, "grad_norm": 12.72054672241211, "learning_rate": 3.436688189095303e-05, "loss": 2.6143, "step": 1500500 }, { "epoch": 3.1420604883276955, "grad_norm": 14.054871559143066, "learning_rate": 3.436163763446274e-05, "loss": 2.6359, "step": 1501000 }, { "epoch": 3.143107144053321, "grad_norm": 13.99551010131836, "learning_rate": 3.435639337797244e-05, "loss": 2.6209, "step": 1501500 }, { "epoch": 3.1441537997789464, "grad_norm": 14.410571098327637, "learning_rate": 3.4351149121482154e-05, "loss": 2.6246, "step": 1502000 }, { "epoch": 3.145200455504572, "grad_norm": 13.64484691619873, "learning_rate": 3.4345904864991865e-05, "loss": 2.6352, "step": 1502500 }, { "epoch": 3.1462471112301973, "grad_norm": 13.559737205505371, "learning_rate": 3.434066060850157e-05, "loss": 2.6389, "step": 1503000 }, { "epoch": 3.1472937669558227, "grad_norm": 14.205113410949707, "learning_rate": 3.433541635201128e-05, "loss": 2.6236, "step": 1503500 }, { "epoch": 3.148340422681448, "grad_norm": 12.403366088867188, "learning_rate": 3.4330172095520985e-05, "loss": 2.615, "step": 1504000 }, { "epoch": 3.1493870784070737, "grad_norm": 16.693552017211914, "learning_rate": 3.4324927839030696e-05, "loss": 2.6102, "step": 1504500 }, { "epoch": 3.150433734132699, "grad_norm": 13.372900009155273, "learning_rate": 3.43196835825404e-05, "loss": 2.6348, "step": 1505000 }, { "epoch": 3.1514803898583246, "grad_norm": 13.73464584350586, "learning_rate": 3.431443932605011e-05, "loss": 2.6222, "step": 1505500 }, { "epoch": 3.15252704558395, "grad_norm": 13.400308609008789, "learning_rate": 3.4309195069559816e-05, "loss": 2.6407, "step": 1506000 }, { "epoch": 3.1535737013095755, "grad_norm": 12.056028366088867, "learning_rate": 3.430395081306953e-05, "loss": 2.6138, "step": 1506500 }, { "epoch": 3.154620357035201, "grad_norm": 13.69635009765625, "learning_rate": 3.429870655657924e-05, "loss": 2.6292, "step": 1507000 }, { "epoch": 3.1556670127608264, "grad_norm": 11.529740333557129, "learning_rate": 3.429346230008895e-05, "loss": 2.6134, "step": 1507500 }, { "epoch": 3.1567136684864523, "grad_norm": 13.662284851074219, "learning_rate": 3.428821804359865e-05, "loss": 2.6305, "step": 1508000 }, { "epoch": 3.1577603242120778, "grad_norm": 14.79899787902832, "learning_rate": 3.428297378710836e-05, "loss": 2.6122, "step": 1508500 }, { "epoch": 3.158806979937703, "grad_norm": 14.496898651123047, "learning_rate": 3.427772953061807e-05, "loss": 2.6373, "step": 1509000 }, { "epoch": 3.1598536356633287, "grad_norm": 12.51870059967041, "learning_rate": 3.427248527412777e-05, "loss": 2.6428, "step": 1509500 }, { "epoch": 3.160900291388954, "grad_norm": 14.424471855163574, "learning_rate": 3.4267241017637484e-05, "loss": 2.6449, "step": 1510000 }, { "epoch": 3.1619469471145796, "grad_norm": 12.257540702819824, "learning_rate": 3.426199676114719e-05, "loss": 2.6351, "step": 1510500 }, { "epoch": 3.162993602840205, "grad_norm": 12.727651596069336, "learning_rate": 3.42567525046569e-05, "loss": 2.6312, "step": 1511000 }, { "epoch": 3.1640402585658305, "grad_norm": 14.723362922668457, "learning_rate": 3.425150824816661e-05, "loss": 2.6353, "step": 1511500 }, { "epoch": 3.165086914291456, "grad_norm": 11.752311706542969, "learning_rate": 3.424626399167632e-05, "loss": 2.62, "step": 1512000 }, { "epoch": 3.1661335700170814, "grad_norm": 11.987339973449707, "learning_rate": 3.4241019735186026e-05, "loss": 2.6304, "step": 1512500 }, { "epoch": 3.167180225742707, "grad_norm": 13.644349098205566, "learning_rate": 3.423577547869574e-05, "loss": 2.5902, "step": 1513000 }, { "epoch": 3.1682268814683323, "grad_norm": 13.142813682556152, "learning_rate": 3.423053122220544e-05, "loss": 2.6325, "step": 1513500 }, { "epoch": 3.1692735371939578, "grad_norm": 16.775848388671875, "learning_rate": 3.422528696571515e-05, "loss": 2.623, "step": 1514000 }, { "epoch": 3.1703201929195832, "grad_norm": 13.653470039367676, "learning_rate": 3.422004270922486e-05, "loss": 2.6233, "step": 1514500 }, { "epoch": 3.1713668486452087, "grad_norm": 14.740654945373535, "learning_rate": 3.421479845273456e-05, "loss": 2.6313, "step": 1515000 }, { "epoch": 3.172413504370834, "grad_norm": 11.640379905700684, "learning_rate": 3.420955419624428e-05, "loss": 2.6316, "step": 1515500 }, { "epoch": 3.1734601600964596, "grad_norm": 12.115570068359375, "learning_rate": 3.420430993975398e-05, "loss": 2.6281, "step": 1516000 }, { "epoch": 3.1745068158220855, "grad_norm": 13.107054710388184, "learning_rate": 3.4199065683263694e-05, "loss": 2.6074, "step": 1516500 }, { "epoch": 3.175553471547711, "grad_norm": 15.557278633117676, "learning_rate": 3.41938214267734e-05, "loss": 2.6307, "step": 1517000 }, { "epoch": 3.1766001272733364, "grad_norm": 19.83409881591797, "learning_rate": 3.418857717028311e-05, "loss": 2.6403, "step": 1517500 }, { "epoch": 3.177646782998962, "grad_norm": 13.212686538696289, "learning_rate": 3.4183332913792814e-05, "loss": 2.6082, "step": 1518000 }, { "epoch": 3.1786934387245873, "grad_norm": 15.07593059539795, "learning_rate": 3.4178088657302525e-05, "loss": 2.6215, "step": 1518500 }, { "epoch": 3.1797400944502128, "grad_norm": 13.099095344543457, "learning_rate": 3.417284440081223e-05, "loss": 2.6253, "step": 1519000 }, { "epoch": 3.1807867501758382, "grad_norm": 12.875975608825684, "learning_rate": 3.416760014432194e-05, "loss": 2.6422, "step": 1519500 }, { "epoch": 3.1818334059014637, "grad_norm": 12.220877647399902, "learning_rate": 3.416235588783165e-05, "loss": 2.6436, "step": 1520000 }, { "epoch": 3.182880061627089, "grad_norm": 14.426410675048828, "learning_rate": 3.4157111631341356e-05, "loss": 2.6239, "step": 1520500 }, { "epoch": 3.1839267173527146, "grad_norm": 16.823484420776367, "learning_rate": 3.415186737485107e-05, "loss": 2.6237, "step": 1521000 }, { "epoch": 3.18497337307834, "grad_norm": 13.044448852539062, "learning_rate": 3.414662311836077e-05, "loss": 2.5972, "step": 1521500 }, { "epoch": 3.1860200288039655, "grad_norm": 14.419713020324707, "learning_rate": 3.414137886187048e-05, "loss": 2.622, "step": 1522000 }, { "epoch": 3.187066684529591, "grad_norm": 12.301756858825684, "learning_rate": 3.4136134605380187e-05, "loss": 2.5924, "step": 1522500 }, { "epoch": 3.1881133402552164, "grad_norm": 16.56977653503418, "learning_rate": 3.41308903488899e-05, "loss": 2.6149, "step": 1523000 }, { "epoch": 3.189159995980842, "grad_norm": 14.019543647766113, "learning_rate": 3.41256460923996e-05, "loss": 2.6331, "step": 1523500 }, { "epoch": 3.1902066517064673, "grad_norm": 14.229530334472656, "learning_rate": 3.412040183590931e-05, "loss": 2.5951, "step": 1524000 }, { "epoch": 3.191253307432093, "grad_norm": 13.297931671142578, "learning_rate": 3.4115157579419024e-05, "loss": 2.6338, "step": 1524500 }, { "epoch": 3.1922999631577182, "grad_norm": 13.278608322143555, "learning_rate": 3.4109913322928735e-05, "loss": 2.6392, "step": 1525000 }, { "epoch": 3.193346618883344, "grad_norm": 14.355673789978027, "learning_rate": 3.410466906643844e-05, "loss": 2.6309, "step": 1525500 }, { "epoch": 3.1943932746089696, "grad_norm": 13.51214599609375, "learning_rate": 3.4099424809948144e-05, "loss": 2.6338, "step": 1526000 }, { "epoch": 3.195439930334595, "grad_norm": 13.316469192504883, "learning_rate": 3.4094180553457855e-05, "loss": 2.6219, "step": 1526500 }, { "epoch": 3.1964865860602205, "grad_norm": 13.555424690246582, "learning_rate": 3.408893629696756e-05, "loss": 2.6004, "step": 1527000 }, { "epoch": 3.197533241785846, "grad_norm": 13.825494766235352, "learning_rate": 3.408369204047727e-05, "loss": 2.6301, "step": 1527500 }, { "epoch": 3.1985798975114714, "grad_norm": 12.955519676208496, "learning_rate": 3.4078447783986975e-05, "loss": 2.6248, "step": 1528000 }, { "epoch": 3.199626553237097, "grad_norm": 14.181869506835938, "learning_rate": 3.4073203527496686e-05, "loss": 2.6268, "step": 1528500 }, { "epoch": 3.2006732089627223, "grad_norm": 13.688854217529297, "learning_rate": 3.40679592710064e-05, "loss": 2.6181, "step": 1529000 }, { "epoch": 3.201719864688348, "grad_norm": 14.375650405883789, "learning_rate": 3.406271501451611e-05, "loss": 2.6112, "step": 1529500 }, { "epoch": 3.2027665204139733, "grad_norm": 12.828375816345215, "learning_rate": 3.405747075802581e-05, "loss": 2.6193, "step": 1530000 }, { "epoch": 3.2038131761395987, "grad_norm": 13.556062698364258, "learning_rate": 3.405222650153552e-05, "loss": 2.6127, "step": 1530500 }, { "epoch": 3.204859831865224, "grad_norm": 12.882162094116211, "learning_rate": 3.404698224504523e-05, "loss": 2.6362, "step": 1531000 }, { "epoch": 3.2059064875908496, "grad_norm": 13.589425086975098, "learning_rate": 3.404173798855493e-05, "loss": 2.6274, "step": 1531500 }, { "epoch": 3.206953143316475, "grad_norm": 14.799694061279297, "learning_rate": 3.403649373206464e-05, "loss": 2.6426, "step": 1532000 }, { "epoch": 3.2079997990421005, "grad_norm": 11.951351165771484, "learning_rate": 3.403124947557435e-05, "loss": 2.616, "step": 1532500 }, { "epoch": 3.209046454767726, "grad_norm": 17.00472640991211, "learning_rate": 3.4026005219084065e-05, "loss": 2.6172, "step": 1533000 }, { "epoch": 3.2100931104933514, "grad_norm": 13.808574676513672, "learning_rate": 3.402076096259377e-05, "loss": 2.6203, "step": 1533500 }, { "epoch": 3.2111397662189773, "grad_norm": 15.532210350036621, "learning_rate": 3.401551670610348e-05, "loss": 2.6317, "step": 1534000 }, { "epoch": 3.212186421944603, "grad_norm": 14.684699058532715, "learning_rate": 3.4010272449613185e-05, "loss": 2.6363, "step": 1534500 }, { "epoch": 3.2132330776702283, "grad_norm": 12.828195571899414, "learning_rate": 3.4005028193122896e-05, "loss": 2.6154, "step": 1535000 }, { "epoch": 3.2142797333958537, "grad_norm": 13.337151527404785, "learning_rate": 3.39997839366326e-05, "loss": 2.6294, "step": 1535500 }, { "epoch": 3.215326389121479, "grad_norm": 25.826656341552734, "learning_rate": 3.399453968014231e-05, "loss": 2.6348, "step": 1536000 }, { "epoch": 3.2163730448471046, "grad_norm": 12.488067626953125, "learning_rate": 3.3989295423652015e-05, "loss": 2.6311, "step": 1536500 }, { "epoch": 3.21741970057273, "grad_norm": 14.561724662780762, "learning_rate": 3.3984051167161727e-05, "loss": 2.6142, "step": 1537000 }, { "epoch": 3.2184663562983555, "grad_norm": 14.130857467651367, "learning_rate": 3.397880691067144e-05, "loss": 2.6343, "step": 1537500 }, { "epoch": 3.219513012023981, "grad_norm": 12.78018569946289, "learning_rate": 3.397356265418114e-05, "loss": 2.6428, "step": 1538000 }, { "epoch": 3.2205596677496064, "grad_norm": 12.859517097473145, "learning_rate": 3.396831839769085e-05, "loss": 2.6289, "step": 1538500 }, { "epoch": 3.221606323475232, "grad_norm": 13.299440383911133, "learning_rate": 3.396307414120056e-05, "loss": 2.6327, "step": 1539000 }, { "epoch": 3.2226529792008574, "grad_norm": 13.640810012817383, "learning_rate": 3.395782988471027e-05, "loss": 2.6249, "step": 1539500 }, { "epoch": 3.223699634926483, "grad_norm": 13.043303489685059, "learning_rate": 3.395258562821997e-05, "loss": 2.609, "step": 1540000 }, { "epoch": 3.2247462906521083, "grad_norm": 13.4417085647583, "learning_rate": 3.3947341371729684e-05, "loss": 2.6103, "step": 1540500 }, { "epoch": 3.2257929463777337, "grad_norm": 14.322108268737793, "learning_rate": 3.394209711523939e-05, "loss": 2.6112, "step": 1541000 }, { "epoch": 3.226839602103359, "grad_norm": 12.940901756286621, "learning_rate": 3.39368528587491e-05, "loss": 2.6244, "step": 1541500 }, { "epoch": 3.2278862578289846, "grad_norm": 13.4194917678833, "learning_rate": 3.393160860225881e-05, "loss": 2.6027, "step": 1542000 }, { "epoch": 3.22893291355461, "grad_norm": 11.352532386779785, "learning_rate": 3.3926364345768515e-05, "loss": 2.6193, "step": 1542500 }, { "epoch": 3.229979569280236, "grad_norm": 15.22634506225586, "learning_rate": 3.3921120089278226e-05, "loss": 2.6113, "step": 1543000 }, { "epoch": 3.2310262250058615, "grad_norm": 12.387452125549316, "learning_rate": 3.391587583278793e-05, "loss": 2.6478, "step": 1543500 }, { "epoch": 3.232072880731487, "grad_norm": 14.50820255279541, "learning_rate": 3.391063157629764e-05, "loss": 2.6323, "step": 1544000 }, { "epoch": 3.2331195364571124, "grad_norm": 14.171832084655762, "learning_rate": 3.3905387319807345e-05, "loss": 2.5992, "step": 1544500 }, { "epoch": 3.234166192182738, "grad_norm": 12.474421501159668, "learning_rate": 3.3900143063317056e-05, "loss": 2.6319, "step": 1545000 }, { "epoch": 3.2352128479083633, "grad_norm": 11.766410827636719, "learning_rate": 3.389489880682676e-05, "loss": 2.6348, "step": 1545500 }, { "epoch": 3.2362595036339887, "grad_norm": 12.489192962646484, "learning_rate": 3.388965455033647e-05, "loss": 2.6124, "step": 1546000 }, { "epoch": 3.237306159359614, "grad_norm": 13.880818367004395, "learning_rate": 3.388441029384618e-05, "loss": 2.6253, "step": 1546500 }, { "epoch": 3.2383528150852396, "grad_norm": 13.076128959655762, "learning_rate": 3.3879166037355894e-05, "loss": 2.6394, "step": 1547000 }, { "epoch": 3.239399470810865, "grad_norm": 13.71595573425293, "learning_rate": 3.38739217808656e-05, "loss": 2.6219, "step": 1547500 }, { "epoch": 3.2404461265364906, "grad_norm": 14.579802513122559, "learning_rate": 3.386867752437531e-05, "loss": 2.6292, "step": 1548000 }, { "epoch": 3.241492782262116, "grad_norm": 12.303620338439941, "learning_rate": 3.3863433267885014e-05, "loss": 2.6174, "step": 1548500 }, { "epoch": 3.2425394379877415, "grad_norm": 11.30073356628418, "learning_rate": 3.385818901139472e-05, "loss": 2.6371, "step": 1549000 }, { "epoch": 3.243586093713367, "grad_norm": 13.910319328308105, "learning_rate": 3.385294475490443e-05, "loss": 2.5942, "step": 1549500 }, { "epoch": 3.2446327494389924, "grad_norm": 15.200575828552246, "learning_rate": 3.384770049841413e-05, "loss": 2.6114, "step": 1550000 }, { "epoch": 3.245679405164618, "grad_norm": 15.903779983520508, "learning_rate": 3.384245624192385e-05, "loss": 2.6352, "step": 1550500 }, { "epoch": 3.2467260608902433, "grad_norm": 13.828716278076172, "learning_rate": 3.3837211985433555e-05, "loss": 2.599, "step": 1551000 }, { "epoch": 3.247772716615869, "grad_norm": 18.00935173034668, "learning_rate": 3.3831967728943267e-05, "loss": 2.6182, "step": 1551500 }, { "epoch": 3.2488193723414946, "grad_norm": 16.156829833984375, "learning_rate": 3.382672347245297e-05, "loss": 2.628, "step": 1552000 }, { "epoch": 3.24986602806712, "grad_norm": 15.752429008483887, "learning_rate": 3.382147921596268e-05, "loss": 2.6313, "step": 1552500 }, { "epoch": 3.2509126837927456, "grad_norm": 13.050212860107422, "learning_rate": 3.3816234959472386e-05, "loss": 2.6076, "step": 1553000 }, { "epoch": 3.251959339518371, "grad_norm": 13.40114974975586, "learning_rate": 3.38109907029821e-05, "loss": 2.6347, "step": 1553500 }, { "epoch": 3.2530059952439965, "grad_norm": 12.161314964294434, "learning_rate": 3.38057464464918e-05, "loss": 2.6286, "step": 1554000 }, { "epoch": 3.254052650969622, "grad_norm": 13.111798286437988, "learning_rate": 3.380050219000151e-05, "loss": 2.6035, "step": 1554500 }, { "epoch": 3.2550993066952474, "grad_norm": 17.632509231567383, "learning_rate": 3.3795257933511224e-05, "loss": 2.605, "step": 1555000 }, { "epoch": 3.256145962420873, "grad_norm": 12.048283576965332, "learning_rate": 3.379001367702093e-05, "loss": 2.611, "step": 1555500 }, { "epoch": 3.2571926181464983, "grad_norm": 15.224591255187988, "learning_rate": 3.378476942053064e-05, "loss": 2.6271, "step": 1556000 }, { "epoch": 3.2582392738721238, "grad_norm": 15.04339599609375, "learning_rate": 3.3779525164040343e-05, "loss": 2.6299, "step": 1556500 }, { "epoch": 3.259285929597749, "grad_norm": 13.818249702453613, "learning_rate": 3.3774280907550055e-05, "loss": 2.6292, "step": 1557000 }, { "epoch": 3.2603325853233747, "grad_norm": 11.619976043701172, "learning_rate": 3.376903665105976e-05, "loss": 2.6321, "step": 1557500 }, { "epoch": 3.261379241049, "grad_norm": 15.256277084350586, "learning_rate": 3.376379239456947e-05, "loss": 2.641, "step": 1558000 }, { "epoch": 3.2624258967746256, "grad_norm": 14.517608642578125, "learning_rate": 3.3758548138079174e-05, "loss": 2.6101, "step": 1558500 }, { "epoch": 3.263472552500251, "grad_norm": 15.926023483276367, "learning_rate": 3.3753303881588885e-05, "loss": 2.626, "step": 1559000 }, { "epoch": 3.2645192082258765, "grad_norm": 13.105973243713379, "learning_rate": 3.3748059625098596e-05, "loss": 2.6225, "step": 1559500 }, { "epoch": 3.265565863951502, "grad_norm": 13.26851749420166, "learning_rate": 3.37428153686083e-05, "loss": 2.6365, "step": 1560000 }, { "epoch": 3.2666125196771274, "grad_norm": 13.599233627319336, "learning_rate": 3.373757111211801e-05, "loss": 2.6291, "step": 1560500 }, { "epoch": 3.2676591754027533, "grad_norm": 13.285042762756348, "learning_rate": 3.3732326855627716e-05, "loss": 2.5996, "step": 1561000 }, { "epoch": 3.2687058311283788, "grad_norm": 13.077230453491211, "learning_rate": 3.372708259913743e-05, "loss": 2.6356, "step": 1561500 }, { "epoch": 3.269752486854004, "grad_norm": 14.555198669433594, "learning_rate": 3.372183834264713e-05, "loss": 2.6277, "step": 1562000 }, { "epoch": 3.2707991425796297, "grad_norm": 15.093230247497559, "learning_rate": 3.371659408615684e-05, "loss": 2.6231, "step": 1562500 }, { "epoch": 3.271845798305255, "grad_norm": 13.861417770385742, "learning_rate": 3.371134982966655e-05, "loss": 2.6344, "step": 1563000 }, { "epoch": 3.2728924540308806, "grad_norm": 13.813948631286621, "learning_rate": 3.370610557317626e-05, "loss": 2.6241, "step": 1563500 }, { "epoch": 3.273939109756506, "grad_norm": 16.926069259643555, "learning_rate": 3.370086131668597e-05, "loss": 2.6124, "step": 1564000 }, { "epoch": 3.2749857654821315, "grad_norm": 15.063858032226562, "learning_rate": 3.369561706019568e-05, "loss": 2.6168, "step": 1564500 }, { "epoch": 3.276032421207757, "grad_norm": 13.794876098632812, "learning_rate": 3.3690372803705384e-05, "loss": 2.6237, "step": 1565000 }, { "epoch": 3.2770790769333824, "grad_norm": 15.703207969665527, "learning_rate": 3.368512854721509e-05, "loss": 2.5818, "step": 1565500 }, { "epoch": 3.278125732659008, "grad_norm": 15.211177825927734, "learning_rate": 3.36798842907248e-05, "loss": 2.6218, "step": 1566000 }, { "epoch": 3.2791723883846333, "grad_norm": 12.86640739440918, "learning_rate": 3.3674640034234504e-05, "loss": 2.6119, "step": 1566500 }, { "epoch": 3.2802190441102588, "grad_norm": 13.752448081970215, "learning_rate": 3.3669395777744215e-05, "loss": 2.6192, "step": 1567000 }, { "epoch": 3.2812656998358842, "grad_norm": 12.996736526489258, "learning_rate": 3.366415152125392e-05, "loss": 2.6251, "step": 1567500 }, { "epoch": 3.2823123555615097, "grad_norm": 11.689282417297363, "learning_rate": 3.365890726476363e-05, "loss": 2.6091, "step": 1568000 }, { "epoch": 3.2833590112871356, "grad_norm": 12.317570686340332, "learning_rate": 3.365366300827334e-05, "loss": 2.6189, "step": 1568500 }, { "epoch": 3.284405667012761, "grad_norm": 13.523837089538574, "learning_rate": 3.364841875178305e-05, "loss": 2.658, "step": 1569000 }, { "epoch": 3.2854523227383865, "grad_norm": 14.832544326782227, "learning_rate": 3.364317449529276e-05, "loss": 2.6213, "step": 1569500 }, { "epoch": 3.286498978464012, "grad_norm": 15.497859001159668, "learning_rate": 3.363793023880247e-05, "loss": 2.6095, "step": 1570000 }, { "epoch": 3.2875456341896374, "grad_norm": 13.847580909729004, "learning_rate": 3.363268598231217e-05, "loss": 2.6391, "step": 1570500 }, { "epoch": 3.288592289915263, "grad_norm": 13.343616485595703, "learning_rate": 3.362744172582188e-05, "loss": 2.6201, "step": 1571000 }, { "epoch": 3.2896389456408883, "grad_norm": 15.003288269042969, "learning_rate": 3.362219746933159e-05, "loss": 2.6262, "step": 1571500 }, { "epoch": 3.290685601366514, "grad_norm": 12.397673606872559, "learning_rate": 3.36169532128413e-05, "loss": 2.6162, "step": 1572000 }, { "epoch": 3.2917322570921392, "grad_norm": 14.948373794555664, "learning_rate": 3.361170895635101e-05, "loss": 2.6033, "step": 1572500 }, { "epoch": 3.2927789128177647, "grad_norm": 15.985811233520508, "learning_rate": 3.3606464699860714e-05, "loss": 2.6091, "step": 1573000 }, { "epoch": 3.29382556854339, "grad_norm": 11.91251277923584, "learning_rate": 3.3601220443370425e-05, "loss": 2.6316, "step": 1573500 }, { "epoch": 3.2948722242690156, "grad_norm": 12.624189376831055, "learning_rate": 3.359597618688013e-05, "loss": 2.6304, "step": 1574000 }, { "epoch": 3.295918879994641, "grad_norm": 15.372150421142578, "learning_rate": 3.359073193038984e-05, "loss": 2.5957, "step": 1574500 }, { "epoch": 3.2969655357202665, "grad_norm": 12.403984069824219, "learning_rate": 3.3585487673899545e-05, "loss": 2.6146, "step": 1575000 }, { "epoch": 3.298012191445892, "grad_norm": 11.888715744018555, "learning_rate": 3.3580243417409256e-05, "loss": 2.6183, "step": 1575500 }, { "epoch": 3.2990588471715174, "grad_norm": 16.22939109802246, "learning_rate": 3.357499916091896e-05, "loss": 2.6141, "step": 1576000 }, { "epoch": 3.300105502897143, "grad_norm": 14.827712059020996, "learning_rate": 3.356975490442867e-05, "loss": 2.6323, "step": 1576500 }, { "epoch": 3.3011521586227683, "grad_norm": 14.012068748474121, "learning_rate": 3.356451064793838e-05, "loss": 2.6168, "step": 1577000 }, { "epoch": 3.302198814348394, "grad_norm": 13.738649368286133, "learning_rate": 3.355926639144809e-05, "loss": 2.6005, "step": 1577500 }, { "epoch": 3.3032454700740193, "grad_norm": 12.111625671386719, "learning_rate": 3.35540221349578e-05, "loss": 2.6206, "step": 1578000 }, { "epoch": 3.304292125799645, "grad_norm": 11.666239738464355, "learning_rate": 3.35487778784675e-05, "loss": 2.6191, "step": 1578500 }, { "epoch": 3.3053387815252706, "grad_norm": 13.649118423461914, "learning_rate": 3.354353362197721e-05, "loss": 2.6228, "step": 1579000 }, { "epoch": 3.306385437250896, "grad_norm": 12.087136268615723, "learning_rate": 3.353828936548692e-05, "loss": 2.6229, "step": 1579500 }, { "epoch": 3.3074320929765215, "grad_norm": 13.535514831542969, "learning_rate": 3.353304510899663e-05, "loss": 2.6229, "step": 1580000 }, { "epoch": 3.308478748702147, "grad_norm": 13.578423500061035, "learning_rate": 3.352780085250633e-05, "loss": 2.6218, "step": 1580500 }, { "epoch": 3.3095254044277724, "grad_norm": 15.178694725036621, "learning_rate": 3.3522556596016044e-05, "loss": 2.6277, "step": 1581000 }, { "epoch": 3.310572060153398, "grad_norm": 14.607842445373535, "learning_rate": 3.3517312339525755e-05, "loss": 2.6216, "step": 1581500 }, { "epoch": 3.3116187158790233, "grad_norm": 12.838603973388672, "learning_rate": 3.3512068083035466e-05, "loss": 2.6362, "step": 1582000 }, { "epoch": 3.312665371604649, "grad_norm": 15.440075874328613, "learning_rate": 3.350682382654517e-05, "loss": 2.6138, "step": 1582500 }, { "epoch": 3.3137120273302743, "grad_norm": 15.073751449584961, "learning_rate": 3.3501579570054875e-05, "loss": 2.6143, "step": 1583000 }, { "epoch": 3.3147586830558997, "grad_norm": 13.693927764892578, "learning_rate": 3.3496335313564586e-05, "loss": 2.6101, "step": 1583500 }, { "epoch": 3.315805338781525, "grad_norm": 13.1008882522583, "learning_rate": 3.349109105707429e-05, "loss": 2.6123, "step": 1584000 }, { "epoch": 3.3168519945071506, "grad_norm": 12.765411376953125, "learning_rate": 3.3485846800584e-05, "loss": 2.6023, "step": 1584500 }, { "epoch": 3.317898650232776, "grad_norm": 13.082297325134277, "learning_rate": 3.3480602544093706e-05, "loss": 2.6253, "step": 1585000 }, { "epoch": 3.3189453059584015, "grad_norm": 13.753101348876953, "learning_rate": 3.347535828760342e-05, "loss": 2.6242, "step": 1585500 }, { "epoch": 3.3199919616840274, "grad_norm": 12.020339012145996, "learning_rate": 3.347011403111313e-05, "loss": 2.6155, "step": 1586000 }, { "epoch": 3.321038617409653, "grad_norm": 13.345730781555176, "learning_rate": 3.346486977462284e-05, "loss": 2.5924, "step": 1586500 }, { "epoch": 3.3220852731352783, "grad_norm": 12.144383430480957, "learning_rate": 3.345962551813254e-05, "loss": 2.6171, "step": 1587000 }, { "epoch": 3.323131928860904, "grad_norm": 15.135580062866211, "learning_rate": 3.3454381261642254e-05, "loss": 2.6194, "step": 1587500 }, { "epoch": 3.3241785845865293, "grad_norm": 15.087864875793457, "learning_rate": 3.344913700515196e-05, "loss": 2.6131, "step": 1588000 }, { "epoch": 3.3252252403121547, "grad_norm": 13.164562225341797, "learning_rate": 3.344389274866166e-05, "loss": 2.6073, "step": 1588500 }, { "epoch": 3.32627189603778, "grad_norm": 14.353514671325684, "learning_rate": 3.3438648492171374e-05, "loss": 2.6219, "step": 1589000 }, { "epoch": 3.3273185517634056, "grad_norm": 14.821812629699707, "learning_rate": 3.3433404235681085e-05, "loss": 2.618, "step": 1589500 }, { "epoch": 3.328365207489031, "grad_norm": 13.675481796264648, "learning_rate": 3.3428159979190796e-05, "loss": 2.6344, "step": 1590000 }, { "epoch": 3.3294118632146565, "grad_norm": 13.505209922790527, "learning_rate": 3.34229157227005e-05, "loss": 2.6128, "step": 1590500 }, { "epoch": 3.330458518940282, "grad_norm": 13.53839111328125, "learning_rate": 3.341767146621021e-05, "loss": 2.6177, "step": 1591000 }, { "epoch": 3.3315051746659075, "grad_norm": 13.823898315429688, "learning_rate": 3.3412427209719916e-05, "loss": 2.6187, "step": 1591500 }, { "epoch": 3.332551830391533, "grad_norm": 12.663900375366211, "learning_rate": 3.340718295322963e-05, "loss": 2.6114, "step": 1592000 }, { "epoch": 3.3335984861171584, "grad_norm": 16.97340202331543, "learning_rate": 3.340193869673933e-05, "loss": 2.62, "step": 1592500 }, { "epoch": 3.334645141842784, "grad_norm": 12.063414573669434, "learning_rate": 3.339669444024904e-05, "loss": 2.6137, "step": 1593000 }, { "epoch": 3.3356917975684093, "grad_norm": 13.969576835632324, "learning_rate": 3.3391450183758746e-05, "loss": 2.618, "step": 1593500 }, { "epoch": 3.3367384532940347, "grad_norm": 12.744583129882812, "learning_rate": 3.338620592726846e-05, "loss": 2.6066, "step": 1594000 }, { "epoch": 3.33778510901966, "grad_norm": 13.970172882080078, "learning_rate": 3.338096167077817e-05, "loss": 2.6167, "step": 1594500 }, { "epoch": 3.3388317647452856, "grad_norm": 14.116438865661621, "learning_rate": 3.337571741428787e-05, "loss": 2.6176, "step": 1595000 }, { "epoch": 3.339878420470911, "grad_norm": 14.192028045654297, "learning_rate": 3.3370473157797584e-05, "loss": 2.6205, "step": 1595500 }, { "epoch": 3.340925076196537, "grad_norm": 13.76386833190918, "learning_rate": 3.336522890130729e-05, "loss": 2.6393, "step": 1596000 }, { "epoch": 3.3419717319221625, "grad_norm": 12.906645774841309, "learning_rate": 3.3359984644817e-05, "loss": 2.6158, "step": 1596500 }, { "epoch": 3.343018387647788, "grad_norm": 13.679375648498535, "learning_rate": 3.3354740388326704e-05, "loss": 2.6056, "step": 1597000 }, { "epoch": 3.3440650433734134, "grad_norm": 14.397501945495605, "learning_rate": 3.3349496131836415e-05, "loss": 2.6098, "step": 1597500 }, { "epoch": 3.345111699099039, "grad_norm": 11.946187019348145, "learning_rate": 3.334425187534612e-05, "loss": 2.6119, "step": 1598000 }, { "epoch": 3.3461583548246643, "grad_norm": 18.061899185180664, "learning_rate": 3.333900761885583e-05, "loss": 2.6097, "step": 1598500 }, { "epoch": 3.3472050105502897, "grad_norm": 12.881669044494629, "learning_rate": 3.333376336236554e-05, "loss": 2.621, "step": 1599000 }, { "epoch": 3.348251666275915, "grad_norm": 12.085357666015625, "learning_rate": 3.3328519105875246e-05, "loss": 2.606, "step": 1599500 }, { "epoch": 3.3492983220015407, "grad_norm": 12.890158653259277, "learning_rate": 3.332327484938496e-05, "loss": 2.6229, "step": 1600000 }, { "epoch": 3.350344977727166, "grad_norm": 13.625066757202148, "learning_rate": 3.331803059289466e-05, "loss": 2.631, "step": 1600500 }, { "epoch": 3.3513916334527916, "grad_norm": 14.466872215270996, "learning_rate": 3.331278633640437e-05, "loss": 2.5972, "step": 1601000 }, { "epoch": 3.352438289178417, "grad_norm": 13.1347017288208, "learning_rate": 3.3307542079914076e-05, "loss": 2.6207, "step": 1601500 }, { "epoch": 3.3534849449040425, "grad_norm": 16.05615234375, "learning_rate": 3.330229782342379e-05, "loss": 2.6138, "step": 1602000 }, { "epoch": 3.354531600629668, "grad_norm": 16.713909149169922, "learning_rate": 3.329705356693349e-05, "loss": 2.614, "step": 1602500 }, { "epoch": 3.3555782563552934, "grad_norm": 12.926460266113281, "learning_rate": 3.32918093104432e-05, "loss": 2.6137, "step": 1603000 }, { "epoch": 3.3566249120809193, "grad_norm": 13.5143404006958, "learning_rate": 3.3286565053952914e-05, "loss": 2.6226, "step": 1603500 }, { "epoch": 3.3576715678065447, "grad_norm": 13.596628189086914, "learning_rate": 3.3281320797462625e-05, "loss": 2.5945, "step": 1604000 }, { "epoch": 3.35871822353217, "grad_norm": 12.945680618286133, "learning_rate": 3.327607654097233e-05, "loss": 2.6303, "step": 1604500 }, { "epoch": 3.3597648792577957, "grad_norm": 28.871109008789062, "learning_rate": 3.3270832284482034e-05, "loss": 2.6115, "step": 1605000 }, { "epoch": 3.360811534983421, "grad_norm": 12.930505752563477, "learning_rate": 3.3265588027991745e-05, "loss": 2.615, "step": 1605500 }, { "epoch": 3.3618581907090466, "grad_norm": 14.027061462402344, "learning_rate": 3.326034377150145e-05, "loss": 2.6157, "step": 1606000 }, { "epoch": 3.362904846434672, "grad_norm": 13.415884971618652, "learning_rate": 3.325509951501116e-05, "loss": 2.6061, "step": 1606500 }, { "epoch": 3.3639515021602975, "grad_norm": 16.869516372680664, "learning_rate": 3.324985525852087e-05, "loss": 2.6116, "step": 1607000 }, { "epoch": 3.364998157885923, "grad_norm": 12.850579261779785, "learning_rate": 3.324461100203058e-05, "loss": 2.603, "step": 1607500 }, { "epoch": 3.3660448136115484, "grad_norm": 11.954508781433105, "learning_rate": 3.3239366745540286e-05, "loss": 2.5906, "step": 1608000 }, { "epoch": 3.367091469337174, "grad_norm": 14.10905647277832, "learning_rate": 3.323412248905e-05, "loss": 2.6234, "step": 1608500 }, { "epoch": 3.3681381250627993, "grad_norm": 13.855284690856934, "learning_rate": 3.32288782325597e-05, "loss": 2.6139, "step": 1609000 }, { "epoch": 3.3691847807884248, "grad_norm": 14.55602741241455, "learning_rate": 3.322363397606941e-05, "loss": 2.5953, "step": 1609500 }, { "epoch": 3.37023143651405, "grad_norm": 15.215158462524414, "learning_rate": 3.321838971957912e-05, "loss": 2.6101, "step": 1610000 }, { "epoch": 3.3712780922396757, "grad_norm": 16.381855010986328, "learning_rate": 3.321314546308883e-05, "loss": 2.6092, "step": 1610500 }, { "epoch": 3.372324747965301, "grad_norm": 14.513357162475586, "learning_rate": 3.320790120659853e-05, "loss": 2.6245, "step": 1611000 }, { "epoch": 3.3733714036909266, "grad_norm": 14.390244483947754, "learning_rate": 3.3202656950108244e-05, "loss": 2.6168, "step": 1611500 }, { "epoch": 3.374418059416552, "grad_norm": 13.735180854797363, "learning_rate": 3.3197412693617955e-05, "loss": 2.614, "step": 1612000 }, { "epoch": 3.3754647151421775, "grad_norm": 12.453423500061035, "learning_rate": 3.319216843712766e-05, "loss": 2.6046, "step": 1612500 }, { "epoch": 3.376511370867803, "grad_norm": 13.116125106811523, "learning_rate": 3.318692418063737e-05, "loss": 2.5933, "step": 1613000 }, { "epoch": 3.377558026593429, "grad_norm": 14.534039497375488, "learning_rate": 3.3181679924147074e-05, "loss": 2.5946, "step": 1613500 }, { "epoch": 3.3786046823190543, "grad_norm": 11.905715942382812, "learning_rate": 3.3176435667656786e-05, "loss": 2.6098, "step": 1614000 }, { "epoch": 3.3796513380446798, "grad_norm": 12.036633491516113, "learning_rate": 3.317119141116649e-05, "loss": 2.6122, "step": 1614500 }, { "epoch": 3.380697993770305, "grad_norm": 14.773463249206543, "learning_rate": 3.31659471546762e-05, "loss": 2.6176, "step": 1615000 }, { "epoch": 3.3817446494959307, "grad_norm": 14.723955154418945, "learning_rate": 3.3160702898185905e-05, "loss": 2.6143, "step": 1615500 }, { "epoch": 3.382791305221556, "grad_norm": 13.309017181396484, "learning_rate": 3.3155458641695616e-05, "loss": 2.6108, "step": 1616000 }, { "epoch": 3.3838379609471816, "grad_norm": 12.637740135192871, "learning_rate": 3.315021438520533e-05, "loss": 2.5858, "step": 1616500 }, { "epoch": 3.384884616672807, "grad_norm": 13.823939323425293, "learning_rate": 3.314497012871503e-05, "loss": 2.6225, "step": 1617000 }, { "epoch": 3.3859312723984325, "grad_norm": 12.740200996398926, "learning_rate": 3.313972587222474e-05, "loss": 2.6111, "step": 1617500 }, { "epoch": 3.386977928124058, "grad_norm": 14.999689102172852, "learning_rate": 3.313448161573445e-05, "loss": 2.5833, "step": 1618000 }, { "epoch": 3.3880245838496834, "grad_norm": 14.730511665344238, "learning_rate": 3.312923735924416e-05, "loss": 2.6145, "step": 1618500 }, { "epoch": 3.389071239575309, "grad_norm": 14.49223518371582, "learning_rate": 3.312399310275386e-05, "loss": 2.5996, "step": 1619000 }, { "epoch": 3.3901178953009343, "grad_norm": 16.091060638427734, "learning_rate": 3.3118748846263574e-05, "loss": 2.6168, "step": 1619500 }, { "epoch": 3.39116455102656, "grad_norm": 12.687015533447266, "learning_rate": 3.311350458977328e-05, "loss": 2.599, "step": 1620000 }, { "epoch": 3.3922112067521852, "grad_norm": 14.631107330322266, "learning_rate": 3.310826033328299e-05, "loss": 2.5997, "step": 1620500 }, { "epoch": 3.393257862477811, "grad_norm": 12.873712539672852, "learning_rate": 3.31030160767927e-05, "loss": 2.6148, "step": 1621000 }, { "epoch": 3.3943045182034366, "grad_norm": 13.644495964050293, "learning_rate": 3.309777182030241e-05, "loss": 2.6177, "step": 1621500 }, { "epoch": 3.395351173929062, "grad_norm": 16.20016860961914, "learning_rate": 3.3092527563812115e-05, "loss": 2.6138, "step": 1622000 }, { "epoch": 3.3963978296546875, "grad_norm": 13.884345054626465, "learning_rate": 3.308728330732182e-05, "loss": 2.599, "step": 1622500 }, { "epoch": 3.397444485380313, "grad_norm": 12.372699737548828, "learning_rate": 3.308203905083153e-05, "loss": 2.6141, "step": 1623000 }, { "epoch": 3.3984911411059384, "grad_norm": 13.4647855758667, "learning_rate": 3.3076794794341235e-05, "loss": 2.6092, "step": 1623500 }, { "epoch": 3.399537796831564, "grad_norm": 13.74511432647705, "learning_rate": 3.3071550537850946e-05, "loss": 2.6164, "step": 1624000 }, { "epoch": 3.4005844525571893, "grad_norm": 12.101808547973633, "learning_rate": 3.306630628136066e-05, "loss": 2.5998, "step": 1624500 }, { "epoch": 3.401631108282815, "grad_norm": 18.401456832885742, "learning_rate": 3.306106202487037e-05, "loss": 2.6272, "step": 1625000 }, { "epoch": 3.4026777640084402, "grad_norm": 13.980360984802246, "learning_rate": 3.305581776838007e-05, "loss": 2.6104, "step": 1625500 }, { "epoch": 3.4037244197340657, "grad_norm": 13.911930084228516, "learning_rate": 3.3050573511889784e-05, "loss": 2.597, "step": 1626000 }, { "epoch": 3.404771075459691, "grad_norm": 11.517435073852539, "learning_rate": 3.304532925539949e-05, "loss": 2.6238, "step": 1626500 }, { "epoch": 3.4058177311853166, "grad_norm": 12.70619010925293, "learning_rate": 3.30400849989092e-05, "loss": 2.608, "step": 1627000 }, { "epoch": 3.406864386910942, "grad_norm": 13.39412784576416, "learning_rate": 3.30348407424189e-05, "loss": 2.5979, "step": 1627500 }, { "epoch": 3.4079110426365675, "grad_norm": 13.991933822631836, "learning_rate": 3.302959648592861e-05, "loss": 2.6086, "step": 1628000 }, { "epoch": 3.408957698362193, "grad_norm": 14.408665657043457, "learning_rate": 3.302435222943832e-05, "loss": 2.5987, "step": 1628500 }, { "epoch": 3.4100043540878184, "grad_norm": 15.97365951538086, "learning_rate": 3.301910797294803e-05, "loss": 2.6102, "step": 1629000 }, { "epoch": 3.411051009813444, "grad_norm": 11.642365455627441, "learning_rate": 3.301386371645774e-05, "loss": 2.5869, "step": 1629500 }, { "epoch": 3.4120976655390693, "grad_norm": 13.51561164855957, "learning_rate": 3.3008619459967445e-05, "loss": 2.6076, "step": 1630000 }, { "epoch": 3.413144321264695, "grad_norm": 13.810663223266602, "learning_rate": 3.3003375203477156e-05, "loss": 2.6329, "step": 1630500 }, { "epoch": 3.4141909769903207, "grad_norm": 13.605813980102539, "learning_rate": 3.299813094698686e-05, "loss": 2.6032, "step": 1631000 }, { "epoch": 3.415237632715946, "grad_norm": 14.518292427062988, "learning_rate": 3.299288669049657e-05, "loss": 2.6174, "step": 1631500 }, { "epoch": 3.4162842884415716, "grad_norm": 11.163458824157715, "learning_rate": 3.2987642434006276e-05, "loss": 2.6327, "step": 1632000 }, { "epoch": 3.417330944167197, "grad_norm": 13.703085899353027, "learning_rate": 3.298239817751599e-05, "loss": 2.6082, "step": 1632500 }, { "epoch": 3.4183775998928225, "grad_norm": 11.406394004821777, "learning_rate": 3.297715392102569e-05, "loss": 2.6065, "step": 1633000 }, { "epoch": 3.419424255618448, "grad_norm": 13.184581756591797, "learning_rate": 3.29719096645354e-05, "loss": 2.602, "step": 1633500 }, { "epoch": 3.4204709113440734, "grad_norm": 13.470970153808594, "learning_rate": 3.2966665408045114e-05, "loss": 2.5916, "step": 1634000 }, { "epoch": 3.421517567069699, "grad_norm": 13.603510856628418, "learning_rate": 3.296142115155482e-05, "loss": 2.603, "step": 1634500 }, { "epoch": 3.4225642227953244, "grad_norm": 14.658100128173828, "learning_rate": 3.295617689506453e-05, "loss": 2.6218, "step": 1635000 }, { "epoch": 3.42361087852095, "grad_norm": 15.017570495605469, "learning_rate": 3.295093263857423e-05, "loss": 2.6135, "step": 1635500 }, { "epoch": 3.4246575342465753, "grad_norm": 14.653863906860352, "learning_rate": 3.2945688382083944e-05, "loss": 2.576, "step": 1636000 }, { "epoch": 3.4257041899722007, "grad_norm": 13.194838523864746, "learning_rate": 3.294044412559365e-05, "loss": 2.62, "step": 1636500 }, { "epoch": 3.426750845697826, "grad_norm": 14.565766334533691, "learning_rate": 3.293519986910336e-05, "loss": 2.6203, "step": 1637000 }, { "epoch": 3.4277975014234516, "grad_norm": 17.574304580688477, "learning_rate": 3.2929955612613064e-05, "loss": 2.6028, "step": 1637500 }, { "epoch": 3.428844157149077, "grad_norm": 11.37977409362793, "learning_rate": 3.2924711356122775e-05, "loss": 2.5994, "step": 1638000 }, { "epoch": 3.429890812874703, "grad_norm": 16.632450103759766, "learning_rate": 3.2919467099632486e-05, "loss": 2.6005, "step": 1638500 }, { "epoch": 3.4309374686003284, "grad_norm": 13.19444751739502, "learning_rate": 3.291422284314219e-05, "loss": 2.6025, "step": 1639000 }, { "epoch": 3.431984124325954, "grad_norm": 15.031281471252441, "learning_rate": 3.29089785866519e-05, "loss": 2.6222, "step": 1639500 }, { "epoch": 3.4330307800515794, "grad_norm": 12.118049621582031, "learning_rate": 3.2903734330161606e-05, "loss": 2.603, "step": 1640000 }, { "epoch": 3.434077435777205, "grad_norm": 14.9058837890625, "learning_rate": 3.289849007367132e-05, "loss": 2.6017, "step": 1640500 }, { "epoch": 3.4351240915028303, "grad_norm": 14.445300102233887, "learning_rate": 3.289324581718102e-05, "loss": 2.597, "step": 1641000 }, { "epoch": 3.4361707472284557, "grad_norm": 13.259748458862305, "learning_rate": 3.288800156069073e-05, "loss": 2.6009, "step": 1641500 }, { "epoch": 3.437217402954081, "grad_norm": 13.449533462524414, "learning_rate": 3.288275730420044e-05, "loss": 2.6052, "step": 1642000 }, { "epoch": 3.4382640586797066, "grad_norm": 12.879916191101074, "learning_rate": 3.2877513047710154e-05, "loss": 2.611, "step": 1642500 }, { "epoch": 3.439310714405332, "grad_norm": 13.35605525970459, "learning_rate": 3.287226879121986e-05, "loss": 2.6148, "step": 1643000 }, { "epoch": 3.4403573701309575, "grad_norm": 15.964502334594727, "learning_rate": 3.286702453472957e-05, "loss": 2.6174, "step": 1643500 }, { "epoch": 3.441404025856583, "grad_norm": 14.999670028686523, "learning_rate": 3.2861780278239274e-05, "loss": 2.6189, "step": 1644000 }, { "epoch": 3.4424506815822085, "grad_norm": 15.030144691467285, "learning_rate": 3.2856536021748985e-05, "loss": 2.6138, "step": 1644500 }, { "epoch": 3.443497337307834, "grad_norm": 14.161288261413574, "learning_rate": 3.285129176525869e-05, "loss": 2.6138, "step": 1645000 }, { "epoch": 3.4445439930334594, "grad_norm": 14.5087251663208, "learning_rate": 3.2846047508768394e-05, "loss": 2.6282, "step": 1645500 }, { "epoch": 3.445590648759085, "grad_norm": 13.448596000671387, "learning_rate": 3.2840803252278105e-05, "loss": 2.6064, "step": 1646000 }, { "epoch": 3.4466373044847103, "grad_norm": 11.900554656982422, "learning_rate": 3.2835558995787816e-05, "loss": 2.6255, "step": 1646500 }, { "epoch": 3.4476839602103357, "grad_norm": 15.741875648498535, "learning_rate": 3.283031473929753e-05, "loss": 2.5766, "step": 1647000 }, { "epoch": 3.448730615935961, "grad_norm": 12.28099250793457, "learning_rate": 3.282507048280723e-05, "loss": 2.6036, "step": 1647500 }, { "epoch": 3.4497772716615867, "grad_norm": 12.105960845947266, "learning_rate": 3.281982622631694e-05, "loss": 2.6151, "step": 1648000 }, { "epoch": 3.4508239273872126, "grad_norm": 12.637110710144043, "learning_rate": 3.281458196982665e-05, "loss": 2.6102, "step": 1648500 }, { "epoch": 3.451870583112838, "grad_norm": 14.528226852416992, "learning_rate": 3.280933771333636e-05, "loss": 2.63, "step": 1649000 }, { "epoch": 3.4529172388384635, "grad_norm": 14.05070686340332, "learning_rate": 3.280409345684606e-05, "loss": 2.5928, "step": 1649500 }, { "epoch": 3.453963894564089, "grad_norm": 17.85044288635254, "learning_rate": 3.279884920035577e-05, "loss": 2.6125, "step": 1650000 }, { "epoch": 3.4550105502897144, "grad_norm": 15.436519622802734, "learning_rate": 3.279360494386548e-05, "loss": 2.5907, "step": 1650500 }, { "epoch": 3.45605720601534, "grad_norm": 11.754314422607422, "learning_rate": 3.278836068737519e-05, "loss": 2.6085, "step": 1651000 }, { "epoch": 3.4571038617409653, "grad_norm": 12.04644775390625, "learning_rate": 3.27831164308849e-05, "loss": 2.588, "step": 1651500 }, { "epoch": 3.4581505174665907, "grad_norm": 13.786872863769531, "learning_rate": 3.2777872174394604e-05, "loss": 2.5978, "step": 1652000 }, { "epoch": 3.459197173192216, "grad_norm": 13.192914009094238, "learning_rate": 3.2772627917904315e-05, "loss": 2.6175, "step": 1652500 }, { "epoch": 3.4602438289178417, "grad_norm": 17.06948471069336, "learning_rate": 3.276738366141402e-05, "loss": 2.6228, "step": 1653000 }, { "epoch": 3.461290484643467, "grad_norm": 12.300341606140137, "learning_rate": 3.276213940492373e-05, "loss": 2.6113, "step": 1653500 }, { "epoch": 3.4623371403690926, "grad_norm": 13.911660194396973, "learning_rate": 3.2756895148433435e-05, "loss": 2.6202, "step": 1654000 }, { "epoch": 3.463383796094718, "grad_norm": 12.737122535705566, "learning_rate": 3.2751650891943146e-05, "loss": 2.609, "step": 1654500 }, { "epoch": 3.4644304518203435, "grad_norm": 13.436936378479004, "learning_rate": 3.274640663545285e-05, "loss": 2.6088, "step": 1655000 }, { "epoch": 3.465477107545969, "grad_norm": 11.9603853225708, "learning_rate": 3.274116237896256e-05, "loss": 2.5997, "step": 1655500 }, { "epoch": 3.466523763271595, "grad_norm": 17.09577178955078, "learning_rate": 3.273591812247227e-05, "loss": 2.5992, "step": 1656000 }, { "epoch": 3.4675704189972203, "grad_norm": 15.058382034301758, "learning_rate": 3.2730673865981977e-05, "loss": 2.5839, "step": 1656500 }, { "epoch": 3.4686170747228458, "grad_norm": 12.32164478302002, "learning_rate": 3.272542960949169e-05, "loss": 2.6045, "step": 1657000 }, { "epoch": 3.469663730448471, "grad_norm": 12.909139633178711, "learning_rate": 3.272018535300139e-05, "loss": 2.6251, "step": 1657500 }, { "epoch": 3.4707103861740967, "grad_norm": 11.656106948852539, "learning_rate": 3.27149410965111e-05, "loss": 2.5898, "step": 1658000 }, { "epoch": 3.471757041899722, "grad_norm": 13.364767074584961, "learning_rate": 3.270969684002081e-05, "loss": 2.6162, "step": 1658500 }, { "epoch": 3.4728036976253476, "grad_norm": 16.958356857299805, "learning_rate": 3.270445258353052e-05, "loss": 2.6043, "step": 1659000 }, { "epoch": 3.473850353350973, "grad_norm": 14.455743789672852, "learning_rate": 3.269920832704022e-05, "loss": 2.6051, "step": 1659500 }, { "epoch": 3.4748970090765985, "grad_norm": 11.698323249816895, "learning_rate": 3.269396407054994e-05, "loss": 2.6017, "step": 1660000 }, { "epoch": 3.475943664802224, "grad_norm": 15.846353530883789, "learning_rate": 3.2688719814059645e-05, "loss": 2.5826, "step": 1660500 }, { "epoch": 3.4769903205278494, "grad_norm": 12.663432121276855, "learning_rate": 3.2683475557569356e-05, "loss": 2.5875, "step": 1661000 }, { "epoch": 3.478036976253475, "grad_norm": 14.07406997680664, "learning_rate": 3.267823130107906e-05, "loss": 2.5983, "step": 1661500 }, { "epoch": 3.4790836319791003, "grad_norm": 14.327420234680176, "learning_rate": 3.2672987044588765e-05, "loss": 2.6017, "step": 1662000 }, { "epoch": 3.4801302877047258, "grad_norm": 13.958187103271484, "learning_rate": 3.2667742788098476e-05, "loss": 2.6095, "step": 1662500 }, { "epoch": 3.4811769434303512, "grad_norm": 13.578497886657715, "learning_rate": 3.266249853160818e-05, "loss": 2.5981, "step": 1663000 }, { "epoch": 3.4822235991559767, "grad_norm": 13.466096878051758, "learning_rate": 3.265725427511789e-05, "loss": 2.586, "step": 1663500 }, { "epoch": 3.483270254881602, "grad_norm": 13.936006546020508, "learning_rate": 3.26520100186276e-05, "loss": 2.5901, "step": 1664000 }, { "epoch": 3.4843169106072276, "grad_norm": 14.064535140991211, "learning_rate": 3.264676576213731e-05, "loss": 2.6006, "step": 1664500 }, { "epoch": 3.485363566332853, "grad_norm": 12.615307807922363, "learning_rate": 3.264152150564702e-05, "loss": 2.5994, "step": 1665000 }, { "epoch": 3.4864102220584785, "grad_norm": 11.98256778717041, "learning_rate": 3.263627724915673e-05, "loss": 2.59, "step": 1665500 }, { "epoch": 3.4874568777841044, "grad_norm": 12.810861587524414, "learning_rate": 3.263103299266643e-05, "loss": 2.566, "step": 1666000 }, { "epoch": 3.48850353350973, "grad_norm": 12.622410774230957, "learning_rate": 3.2625788736176144e-05, "loss": 2.5964, "step": 1666500 }, { "epoch": 3.4895501892353553, "grad_norm": 16.635482788085938, "learning_rate": 3.262054447968585e-05, "loss": 2.6054, "step": 1667000 }, { "epoch": 3.4905968449609808, "grad_norm": 12.924189567565918, "learning_rate": 3.261530022319555e-05, "loss": 2.5924, "step": 1667500 }, { "epoch": 3.4916435006866062, "grad_norm": 14.023768424987793, "learning_rate": 3.2610055966705264e-05, "loss": 2.6276, "step": 1668000 }, { "epoch": 3.4926901564122317, "grad_norm": 12.159735679626465, "learning_rate": 3.2604811710214975e-05, "loss": 2.5937, "step": 1668500 }, { "epoch": 3.493736812137857, "grad_norm": 13.796852111816406, "learning_rate": 3.2599567453724686e-05, "loss": 2.5976, "step": 1669000 }, { "epoch": 3.4947834678634826, "grad_norm": 15.730201721191406, "learning_rate": 3.259432319723439e-05, "loss": 2.6248, "step": 1669500 }, { "epoch": 3.495830123589108, "grad_norm": 13.810144424438477, "learning_rate": 3.25890789407441e-05, "loss": 2.623, "step": 1670000 }, { "epoch": 3.4968767793147335, "grad_norm": 12.595820426940918, "learning_rate": 3.2583834684253805e-05, "loss": 2.5995, "step": 1670500 }, { "epoch": 3.497923435040359, "grad_norm": 13.746564865112305, "learning_rate": 3.2578590427763517e-05, "loss": 2.5935, "step": 1671000 }, { "epoch": 3.4989700907659844, "grad_norm": 12.656621932983398, "learning_rate": 3.257334617127322e-05, "loss": 2.6142, "step": 1671500 }, { "epoch": 3.50001674649161, "grad_norm": 12.168457984924316, "learning_rate": 3.256810191478293e-05, "loss": 2.58, "step": 1672000 }, { "epoch": 3.5010634022172353, "grad_norm": 13.213780403137207, "learning_rate": 3.2562857658292636e-05, "loss": 2.5983, "step": 1672500 }, { "epoch": 3.5021100579428612, "grad_norm": 13.728453636169434, "learning_rate": 3.255761340180235e-05, "loss": 2.587, "step": 1673000 }, { "epoch": 3.5031567136684867, "grad_norm": 13.924372673034668, "learning_rate": 3.255236914531206e-05, "loss": 2.6055, "step": 1673500 }, { "epoch": 3.504203369394112, "grad_norm": 14.26647663116455, "learning_rate": 3.254712488882176e-05, "loss": 2.5981, "step": 1674000 }, { "epoch": 3.5052500251197376, "grad_norm": 15.037242889404297, "learning_rate": 3.2541880632331474e-05, "loss": 2.5941, "step": 1674500 }, { "epoch": 3.506296680845363, "grad_norm": 15.089444160461426, "learning_rate": 3.253663637584118e-05, "loss": 2.6109, "step": 1675000 }, { "epoch": 3.5073433365709885, "grad_norm": 13.225652694702148, "learning_rate": 3.253139211935089e-05, "loss": 2.6052, "step": 1675500 }, { "epoch": 3.508389992296614, "grad_norm": 12.619418144226074, "learning_rate": 3.2526147862860593e-05, "loss": 2.5988, "step": 1676000 }, { "epoch": 3.5094366480222394, "grad_norm": 14.013951301574707, "learning_rate": 3.2520903606370305e-05, "loss": 2.5932, "step": 1676500 }, { "epoch": 3.510483303747865, "grad_norm": 14.34852123260498, "learning_rate": 3.251565934988001e-05, "loss": 2.5868, "step": 1677000 }, { "epoch": 3.5115299594734903, "grad_norm": 14.81576919555664, "learning_rate": 3.251041509338973e-05, "loss": 2.612, "step": 1677500 }, { "epoch": 3.512576615199116, "grad_norm": 13.414228439331055, "learning_rate": 3.250517083689943e-05, "loss": 2.6074, "step": 1678000 }, { "epoch": 3.5136232709247412, "grad_norm": 13.292221069335938, "learning_rate": 3.249992658040914e-05, "loss": 2.5804, "step": 1678500 }, { "epoch": 3.5146699266503667, "grad_norm": 14.084498405456543, "learning_rate": 3.2494682323918846e-05, "loss": 2.6022, "step": 1679000 }, { "epoch": 3.515716582375992, "grad_norm": 17.237951278686523, "learning_rate": 3.248943806742855e-05, "loss": 2.6218, "step": 1679500 }, { "epoch": 3.5167632381016176, "grad_norm": 14.735353469848633, "learning_rate": 3.248419381093826e-05, "loss": 2.5707, "step": 1680000 }, { "epoch": 3.517809893827243, "grad_norm": 12.951642036437988, "learning_rate": 3.2478949554447966e-05, "loss": 2.6157, "step": 1680500 }, { "epoch": 3.5188565495528685, "grad_norm": 13.806337356567383, "learning_rate": 3.247370529795768e-05, "loss": 2.5891, "step": 1681000 }, { "epoch": 3.519903205278494, "grad_norm": 12.432610511779785, "learning_rate": 3.246846104146739e-05, "loss": 2.6192, "step": 1681500 }, { "epoch": 3.5209498610041194, "grad_norm": 16.44268226623535, "learning_rate": 3.24632167849771e-05, "loss": 2.6176, "step": 1682000 }, { "epoch": 3.521996516729745, "grad_norm": 17.549606323242188, "learning_rate": 3.2457972528486804e-05, "loss": 2.6037, "step": 1682500 }, { "epoch": 3.5230431724553704, "grad_norm": 13.574739456176758, "learning_rate": 3.2452728271996515e-05, "loss": 2.5947, "step": 1683000 }, { "epoch": 3.524089828180996, "grad_norm": 13.657498359680176, "learning_rate": 3.244748401550622e-05, "loss": 2.5965, "step": 1683500 }, { "epoch": 3.5251364839066217, "grad_norm": 18.878150939941406, "learning_rate": 3.244223975901593e-05, "loss": 2.5943, "step": 1684000 }, { "epoch": 3.526183139632247, "grad_norm": 14.861844062805176, "learning_rate": 3.2436995502525634e-05, "loss": 2.5894, "step": 1684500 }, { "epoch": 3.5272297953578726, "grad_norm": 13.43409538269043, "learning_rate": 3.243175124603534e-05, "loss": 2.611, "step": 1685000 }, { "epoch": 3.528276451083498, "grad_norm": 15.96921443939209, "learning_rate": 3.242650698954505e-05, "loss": 2.6072, "step": 1685500 }, { "epoch": 3.5293231068091235, "grad_norm": 12.871570587158203, "learning_rate": 3.242126273305476e-05, "loss": 2.615, "step": 1686000 }, { "epoch": 3.530369762534749, "grad_norm": 12.771160125732422, "learning_rate": 3.241601847656447e-05, "loss": 2.5805, "step": 1686500 }, { "epoch": 3.5314164182603744, "grad_norm": 14.078465461730957, "learning_rate": 3.2410774220074176e-05, "loss": 2.5955, "step": 1687000 }, { "epoch": 3.532463073986, "grad_norm": 14.345671653747559, "learning_rate": 3.240552996358389e-05, "loss": 2.6156, "step": 1687500 }, { "epoch": 3.5335097297116254, "grad_norm": 12.45083236694336, "learning_rate": 3.240028570709359e-05, "loss": 2.6094, "step": 1688000 }, { "epoch": 3.534556385437251, "grad_norm": 13.296991348266602, "learning_rate": 3.23950414506033e-05, "loss": 2.6099, "step": 1688500 }, { "epoch": 3.5356030411628763, "grad_norm": 16.480546951293945, "learning_rate": 3.238979719411301e-05, "loss": 2.5886, "step": 1689000 }, { "epoch": 3.5366496968885017, "grad_norm": 14.165996551513672, "learning_rate": 3.238455293762272e-05, "loss": 2.6082, "step": 1689500 }, { "epoch": 3.537696352614127, "grad_norm": 12.516648292541504, "learning_rate": 3.237930868113242e-05, "loss": 2.5917, "step": 1690000 }, { "epoch": 3.538743008339753, "grad_norm": 16.037437438964844, "learning_rate": 3.2374064424642133e-05, "loss": 2.5937, "step": 1690500 }, { "epoch": 3.5397896640653785, "grad_norm": 15.325817108154297, "learning_rate": 3.2368820168151845e-05, "loss": 2.5894, "step": 1691000 }, { "epoch": 3.540836319791004, "grad_norm": 13.67625617980957, "learning_rate": 3.236357591166155e-05, "loss": 2.5888, "step": 1691500 }, { "epoch": 3.5418829755166295, "grad_norm": 13.257515907287598, "learning_rate": 3.235833165517126e-05, "loss": 2.5913, "step": 1692000 }, { "epoch": 3.542929631242255, "grad_norm": 13.20632266998291, "learning_rate": 3.2353087398680964e-05, "loss": 2.6087, "step": 1692500 }, { "epoch": 3.5439762869678804, "grad_norm": 14.706798553466797, "learning_rate": 3.2347843142190675e-05, "loss": 2.586, "step": 1693000 }, { "epoch": 3.545022942693506, "grad_norm": 15.900867462158203, "learning_rate": 3.234259888570038e-05, "loss": 2.6121, "step": 1693500 }, { "epoch": 3.5460695984191313, "grad_norm": 14.35329818725586, "learning_rate": 3.233735462921009e-05, "loss": 2.607, "step": 1694000 }, { "epoch": 3.5471162541447567, "grad_norm": 23.70778465270996, "learning_rate": 3.2332110372719795e-05, "loss": 2.5837, "step": 1694500 }, { "epoch": 3.548162909870382, "grad_norm": 13.612796783447266, "learning_rate": 3.232686611622951e-05, "loss": 2.5934, "step": 1695000 }, { "epoch": 3.5492095655960076, "grad_norm": 16.193683624267578, "learning_rate": 3.232162185973922e-05, "loss": 2.6171, "step": 1695500 }, { "epoch": 3.550256221321633, "grad_norm": 13.609675407409668, "learning_rate": 3.231637760324892e-05, "loss": 2.5915, "step": 1696000 }, { "epoch": 3.5513028770472586, "grad_norm": 12.827301025390625, "learning_rate": 3.231113334675863e-05, "loss": 2.5984, "step": 1696500 }, { "epoch": 3.552349532772884, "grad_norm": 14.130860328674316, "learning_rate": 3.230588909026834e-05, "loss": 2.5998, "step": 1697000 }, { "epoch": 3.5533961884985095, "grad_norm": 13.291101455688477, "learning_rate": 3.230064483377805e-05, "loss": 2.5802, "step": 1697500 }, { "epoch": 3.554442844224135, "grad_norm": 13.908503532409668, "learning_rate": 3.229540057728775e-05, "loss": 2.6048, "step": 1698000 }, { "epoch": 3.5554894999497604, "grad_norm": 12.723422050476074, "learning_rate": 3.229015632079746e-05, "loss": 2.587, "step": 1698500 }, { "epoch": 3.556536155675386, "grad_norm": 14.541149139404297, "learning_rate": 3.2284912064307174e-05, "loss": 2.5923, "step": 1699000 }, { "epoch": 3.5575828114010113, "grad_norm": 14.990803718566895, "learning_rate": 3.2279667807816885e-05, "loss": 2.5984, "step": 1699500 }, { "epoch": 3.5586294671266367, "grad_norm": 14.923602104187012, "learning_rate": 3.227442355132659e-05, "loss": 2.5843, "step": 1700000 }, { "epoch": 3.559676122852262, "grad_norm": 12.285858154296875, "learning_rate": 3.22691792948363e-05, "loss": 2.6165, "step": 1700500 }, { "epoch": 3.5607227785778877, "grad_norm": 16.03978157043457, "learning_rate": 3.2263935038346005e-05, "loss": 2.6009, "step": 1701000 }, { "epoch": 3.5617694343035136, "grad_norm": 12.184081077575684, "learning_rate": 3.225869078185571e-05, "loss": 2.61, "step": 1701500 }, { "epoch": 3.562816090029139, "grad_norm": 13.065695762634277, "learning_rate": 3.225344652536542e-05, "loss": 2.587, "step": 1702000 }, { "epoch": 3.5638627457547645, "grad_norm": 12.607295989990234, "learning_rate": 3.2248202268875125e-05, "loss": 2.5982, "step": 1702500 }, { "epoch": 3.56490940148039, "grad_norm": 14.270147323608398, "learning_rate": 3.2242958012384836e-05, "loss": 2.6032, "step": 1703000 }, { "epoch": 3.5659560572060154, "grad_norm": 13.571681022644043, "learning_rate": 3.223771375589455e-05, "loss": 2.5847, "step": 1703500 }, { "epoch": 3.567002712931641, "grad_norm": 16.178979873657227, "learning_rate": 3.223246949940426e-05, "loss": 2.5924, "step": 1704000 }, { "epoch": 3.5680493686572663, "grad_norm": 12.266412734985352, "learning_rate": 3.222722524291396e-05, "loss": 2.5919, "step": 1704500 }, { "epoch": 3.5690960243828918, "grad_norm": 13.584173202514648, "learning_rate": 3.2221980986423673e-05, "loss": 2.5856, "step": 1705000 }, { "epoch": 3.570142680108517, "grad_norm": 14.515880584716797, "learning_rate": 3.221673672993338e-05, "loss": 2.6068, "step": 1705500 }, { "epoch": 3.5711893358341427, "grad_norm": 15.109379768371582, "learning_rate": 3.221149247344309e-05, "loss": 2.6032, "step": 1706000 }, { "epoch": 3.572235991559768, "grad_norm": 13.782330513000488, "learning_rate": 3.220624821695279e-05, "loss": 2.5968, "step": 1706500 }, { "epoch": 3.5732826472853936, "grad_norm": 13.429868698120117, "learning_rate": 3.2201003960462504e-05, "loss": 2.5897, "step": 1707000 }, { "epoch": 3.574329303011019, "grad_norm": 13.310483932495117, "learning_rate": 3.219575970397221e-05, "loss": 2.5882, "step": 1707500 }, { "epoch": 3.575375958736645, "grad_norm": 13.980984687805176, "learning_rate": 3.219051544748192e-05, "loss": 2.596, "step": 1708000 }, { "epoch": 3.5764226144622704, "grad_norm": 13.173625946044922, "learning_rate": 3.218527119099163e-05, "loss": 2.5862, "step": 1708500 }, { "epoch": 3.577469270187896, "grad_norm": 15.895495414733887, "learning_rate": 3.2180026934501335e-05, "loss": 2.5958, "step": 1709000 }, { "epoch": 3.5785159259135213, "grad_norm": 14.062317848205566, "learning_rate": 3.2174782678011046e-05, "loss": 2.6121, "step": 1709500 }, { "epoch": 3.5795625816391468, "grad_norm": 13.754453659057617, "learning_rate": 3.216953842152075e-05, "loss": 2.5766, "step": 1710000 }, { "epoch": 3.580609237364772, "grad_norm": 13.152395248413086, "learning_rate": 3.216429416503046e-05, "loss": 2.5978, "step": 1710500 }, { "epoch": 3.5816558930903977, "grad_norm": 14.711223602294922, "learning_rate": 3.2159049908540166e-05, "loss": 2.5972, "step": 1711000 }, { "epoch": 3.582702548816023, "grad_norm": 13.906702041625977, "learning_rate": 3.215380565204988e-05, "loss": 2.6118, "step": 1711500 }, { "epoch": 3.5837492045416486, "grad_norm": 12.272989273071289, "learning_rate": 3.214856139555958e-05, "loss": 2.6177, "step": 1712000 }, { "epoch": 3.584795860267274, "grad_norm": 14.974386215209961, "learning_rate": 3.214331713906929e-05, "loss": 2.5991, "step": 1712500 }, { "epoch": 3.5858425159928995, "grad_norm": 13.106512069702148, "learning_rate": 3.2138072882579e-05, "loss": 2.6091, "step": 1713000 }, { "epoch": 3.586889171718525, "grad_norm": 13.144477844238281, "learning_rate": 3.213282862608871e-05, "loss": 2.5789, "step": 1713500 }, { "epoch": 3.5879358274441504, "grad_norm": 14.32947826385498, "learning_rate": 3.212758436959842e-05, "loss": 2.5869, "step": 1714000 }, { "epoch": 3.588982483169776, "grad_norm": 13.698223114013672, "learning_rate": 3.212234011310812e-05, "loss": 2.5836, "step": 1714500 }, { "epoch": 3.5900291388954013, "grad_norm": 12.452990531921387, "learning_rate": 3.2117095856617834e-05, "loss": 2.5996, "step": 1715000 }, { "epoch": 3.5910757946210268, "grad_norm": 13.32016658782959, "learning_rate": 3.211185160012754e-05, "loss": 2.61, "step": 1715500 }, { "epoch": 3.5921224503466522, "grad_norm": 12.291687965393066, "learning_rate": 3.210660734363725e-05, "loss": 2.596, "step": 1716000 }, { "epoch": 3.5931691060722777, "grad_norm": 14.473573684692383, "learning_rate": 3.210136308714696e-05, "loss": 2.5876, "step": 1716500 }, { "epoch": 3.594215761797903, "grad_norm": 15.208059310913086, "learning_rate": 3.209611883065667e-05, "loss": 2.5846, "step": 1717000 }, { "epoch": 3.5952624175235286, "grad_norm": 15.44985294342041, "learning_rate": 3.2090874574166376e-05, "loss": 2.5872, "step": 1717500 }, { "epoch": 3.596309073249154, "grad_norm": 14.194807052612305, "learning_rate": 3.208563031767609e-05, "loss": 2.6077, "step": 1718000 }, { "epoch": 3.5973557289747795, "grad_norm": 14.652132034301758, "learning_rate": 3.208038606118579e-05, "loss": 2.6117, "step": 1718500 }, { "epoch": 3.5984023847004054, "grad_norm": 14.383037567138672, "learning_rate": 3.2075141804695496e-05, "loss": 2.5688, "step": 1719000 }, { "epoch": 3.599449040426031, "grad_norm": 14.762117385864258, "learning_rate": 3.206989754820521e-05, "loss": 2.5923, "step": 1719500 }, { "epoch": 3.6004956961516563, "grad_norm": 14.482673645019531, "learning_rate": 3.206465329171491e-05, "loss": 2.6073, "step": 1720000 }, { "epoch": 3.6015423518772818, "grad_norm": 14.727602005004883, "learning_rate": 3.205940903522462e-05, "loss": 2.5726, "step": 1720500 }, { "epoch": 3.6025890076029072, "grad_norm": 14.976184844970703, "learning_rate": 3.205416477873433e-05, "loss": 2.6069, "step": 1721000 }, { "epoch": 3.6036356633285327, "grad_norm": 19.55562400817871, "learning_rate": 3.2048920522244044e-05, "loss": 2.5938, "step": 1721500 }, { "epoch": 3.604682319054158, "grad_norm": 22.530197143554688, "learning_rate": 3.204367626575375e-05, "loss": 2.5696, "step": 1722000 }, { "epoch": 3.6057289747797836, "grad_norm": 15.888647079467773, "learning_rate": 3.203843200926346e-05, "loss": 2.5885, "step": 1722500 }, { "epoch": 3.606775630505409, "grad_norm": 16.848115921020508, "learning_rate": 3.2033187752773164e-05, "loss": 2.5752, "step": 1723000 }, { "epoch": 3.6078222862310345, "grad_norm": 14.696463584899902, "learning_rate": 3.2027943496282875e-05, "loss": 2.5918, "step": 1723500 }, { "epoch": 3.60886894195666, "grad_norm": 13.464041709899902, "learning_rate": 3.202269923979258e-05, "loss": 2.5885, "step": 1724000 }, { "epoch": 3.6099155976822854, "grad_norm": 12.038375854492188, "learning_rate": 3.2017454983302284e-05, "loss": 2.6176, "step": 1724500 }, { "epoch": 3.610962253407911, "grad_norm": 12.097074508666992, "learning_rate": 3.2012210726811995e-05, "loss": 2.6356, "step": 1725000 }, { "epoch": 3.612008909133537, "grad_norm": 16.491918563842773, "learning_rate": 3.2006966470321706e-05, "loss": 2.601, "step": 1725500 }, { "epoch": 3.6130555648591622, "grad_norm": 13.75329303741455, "learning_rate": 3.200172221383142e-05, "loss": 2.5908, "step": 1726000 }, { "epoch": 3.6141022205847877, "grad_norm": 15.619519233703613, "learning_rate": 3.199647795734112e-05, "loss": 2.5756, "step": 1726500 }, { "epoch": 3.615148876310413, "grad_norm": 13.012346267700195, "learning_rate": 3.199123370085083e-05, "loss": 2.6049, "step": 1727000 }, { "epoch": 3.6161955320360386, "grad_norm": 16.528045654296875, "learning_rate": 3.1985989444360536e-05, "loss": 2.5876, "step": 1727500 }, { "epoch": 3.617242187761664, "grad_norm": 14.812028884887695, "learning_rate": 3.198074518787025e-05, "loss": 2.5774, "step": 1728000 }, { "epoch": 3.6182888434872895, "grad_norm": 13.58335018157959, "learning_rate": 3.197550093137995e-05, "loss": 2.5959, "step": 1728500 }, { "epoch": 3.619335499212915, "grad_norm": 14.324935913085938, "learning_rate": 3.197025667488966e-05, "loss": 2.5938, "step": 1729000 }, { "epoch": 3.6203821549385404, "grad_norm": 16.282251358032227, "learning_rate": 3.196501241839937e-05, "loss": 2.5811, "step": 1729500 }, { "epoch": 3.621428810664166, "grad_norm": 15.276224136352539, "learning_rate": 3.195976816190908e-05, "loss": 2.6069, "step": 1730000 }, { "epoch": 3.6224754663897913, "grad_norm": 14.444965362548828, "learning_rate": 3.195452390541879e-05, "loss": 2.5828, "step": 1730500 }, { "epoch": 3.623522122115417, "grad_norm": 15.191883087158203, "learning_rate": 3.1949279648928494e-05, "loss": 2.5941, "step": 1731000 }, { "epoch": 3.6245687778410423, "grad_norm": 15.795991897583008, "learning_rate": 3.1944035392438205e-05, "loss": 2.5691, "step": 1731500 }, { "epoch": 3.6256154335666677, "grad_norm": 13.04241943359375, "learning_rate": 3.193879113594791e-05, "loss": 2.5772, "step": 1732000 }, { "epoch": 3.626662089292293, "grad_norm": 14.215209007263184, "learning_rate": 3.193354687945762e-05, "loss": 2.5994, "step": 1732500 }, { "epoch": 3.6277087450179186, "grad_norm": 13.740547180175781, "learning_rate": 3.1928302622967324e-05, "loss": 2.5818, "step": 1733000 }, { "epoch": 3.628755400743544, "grad_norm": 12.859944343566895, "learning_rate": 3.1923058366477036e-05, "loss": 2.5848, "step": 1733500 }, { "epoch": 3.6298020564691695, "grad_norm": 16.395559310913086, "learning_rate": 3.191781410998675e-05, "loss": 2.5857, "step": 1734000 }, { "epoch": 3.630848712194795, "grad_norm": 13.476812362670898, "learning_rate": 3.191256985349646e-05, "loss": 2.5809, "step": 1734500 }, { "epoch": 3.6318953679204204, "grad_norm": 13.61968994140625, "learning_rate": 3.190732559700616e-05, "loss": 2.5793, "step": 1735000 }, { "epoch": 3.632942023646046, "grad_norm": 15.007518768310547, "learning_rate": 3.1902081340515866e-05, "loss": 2.5993, "step": 1735500 }, { "epoch": 3.6339886793716714, "grad_norm": 15.614906311035156, "learning_rate": 3.189683708402558e-05, "loss": 2.6056, "step": 1736000 }, { "epoch": 3.6350353350972973, "grad_norm": 14.400757789611816, "learning_rate": 3.189159282753528e-05, "loss": 2.5778, "step": 1736500 }, { "epoch": 3.6360819908229227, "grad_norm": 11.781644821166992, "learning_rate": 3.188634857104499e-05, "loss": 2.5716, "step": 1737000 }, { "epoch": 3.637128646548548, "grad_norm": 15.543213844299316, "learning_rate": 3.18811043145547e-05, "loss": 2.6004, "step": 1737500 }, { "epoch": 3.6381753022741736, "grad_norm": 13.557723045349121, "learning_rate": 3.187586005806441e-05, "loss": 2.6105, "step": 1738000 }, { "epoch": 3.639221957999799, "grad_norm": 16.357254028320312, "learning_rate": 3.187061580157412e-05, "loss": 2.5912, "step": 1738500 }, { "epoch": 3.6402686137254245, "grad_norm": 11.835071563720703, "learning_rate": 3.186537154508383e-05, "loss": 2.5712, "step": 1739000 }, { "epoch": 3.64131526945105, "grad_norm": 16.11223030090332, "learning_rate": 3.1860127288593535e-05, "loss": 2.5936, "step": 1739500 }, { "epoch": 3.6423619251766755, "grad_norm": 14.605691909790039, "learning_rate": 3.1854883032103246e-05, "loss": 2.5775, "step": 1740000 }, { "epoch": 3.643408580902301, "grad_norm": 16.26837921142578, "learning_rate": 3.184963877561295e-05, "loss": 2.5854, "step": 1740500 }, { "epoch": 3.6444552366279264, "grad_norm": 14.0577974319458, "learning_rate": 3.184439451912266e-05, "loss": 2.593, "step": 1741000 }, { "epoch": 3.645501892353552, "grad_norm": 15.501860618591309, "learning_rate": 3.1839150262632365e-05, "loss": 2.5873, "step": 1741500 }, { "epoch": 3.6465485480791773, "grad_norm": 14.678506851196289, "learning_rate": 3.183390600614207e-05, "loss": 2.6044, "step": 1742000 }, { "epoch": 3.6475952038048027, "grad_norm": 13.392659187316895, "learning_rate": 3.182866174965178e-05, "loss": 2.5819, "step": 1742500 }, { "epoch": 3.6486418595304286, "grad_norm": Infinity, "learning_rate": 3.182341749316149e-05, "loss": 2.5839, "step": 1743000 }, { "epoch": 3.649688515256054, "grad_norm": 14.40477466583252, "learning_rate": 3.18181732366712e-05, "loss": 2.5817, "step": 1743500 }, { "epoch": 3.6507351709816795, "grad_norm": 18.249271392822266, "learning_rate": 3.181292898018091e-05, "loss": 2.569, "step": 1744000 }, { "epoch": 3.651781826707305, "grad_norm": 13.578378677368164, "learning_rate": 3.180768472369062e-05, "loss": 2.5926, "step": 1744500 }, { "epoch": 3.6528284824329305, "grad_norm": 13.351151466369629, "learning_rate": 3.180244046720032e-05, "loss": 2.5874, "step": 1745000 }, { "epoch": 3.653875138158556, "grad_norm": 13.632919311523438, "learning_rate": 3.1797196210710034e-05, "loss": 2.581, "step": 1745500 }, { "epoch": 3.6549217938841814, "grad_norm": 12.948294639587402, "learning_rate": 3.179195195421974e-05, "loss": 2.5924, "step": 1746000 }, { "epoch": 3.655968449609807, "grad_norm": 14.198321342468262, "learning_rate": 3.178670769772945e-05, "loss": 2.5815, "step": 1746500 }, { "epoch": 3.6570151053354323, "grad_norm": 15.514403343200684, "learning_rate": 3.178146344123915e-05, "loss": 2.6079, "step": 1747000 }, { "epoch": 3.6580617610610577, "grad_norm": 13.41122817993164, "learning_rate": 3.1776219184748864e-05, "loss": 2.5678, "step": 1747500 }, { "epoch": 3.659108416786683, "grad_norm": 12.33975887298584, "learning_rate": 3.1770974928258576e-05, "loss": 2.571, "step": 1748000 }, { "epoch": 3.6601550725123086, "grad_norm": 14.509282112121582, "learning_rate": 3.176573067176828e-05, "loss": 2.5934, "step": 1748500 }, { "epoch": 3.661201728237934, "grad_norm": 13.794660568237305, "learning_rate": 3.176048641527799e-05, "loss": 2.5989, "step": 1749000 }, { "epoch": 3.6622483839635596, "grad_norm": 13.711329460144043, "learning_rate": 3.1755242158787695e-05, "loss": 2.5797, "step": 1749500 }, { "epoch": 3.663295039689185, "grad_norm": 20.03778839111328, "learning_rate": 3.1749997902297406e-05, "loss": 2.5877, "step": 1750000 }, { "epoch": 3.6643416954148105, "grad_norm": 24.76704978942871, "learning_rate": 3.174475364580711e-05, "loss": 2.5864, "step": 1750500 }, { "epoch": 3.665388351140436, "grad_norm": 15.962411880493164, "learning_rate": 3.173950938931682e-05, "loss": 2.5974, "step": 1751000 }, { "epoch": 3.6664350068660614, "grad_norm": 13.591133117675781, "learning_rate": 3.173426513282653e-05, "loss": 2.5833, "step": 1751500 }, { "epoch": 3.667481662591687, "grad_norm": 16.576425552368164, "learning_rate": 3.1729020876336244e-05, "loss": 2.5902, "step": 1752000 }, { "epoch": 3.6685283183173123, "grad_norm": 15.93233871459961, "learning_rate": 3.172377661984595e-05, "loss": 2.5899, "step": 1752500 }, { "epoch": 3.6695749740429378, "grad_norm": 13.08527660369873, "learning_rate": 3.171853236335565e-05, "loss": 2.603, "step": 1753000 }, { "epoch": 3.670621629768563, "grad_norm": 16.046953201293945, "learning_rate": 3.1713288106865364e-05, "loss": 2.5853, "step": 1753500 }, { "epoch": 3.671668285494189, "grad_norm": 14.021008491516113, "learning_rate": 3.170804385037507e-05, "loss": 2.5961, "step": 1754000 }, { "epoch": 3.6727149412198146, "grad_norm": 12.73055362701416, "learning_rate": 3.170279959388478e-05, "loss": 2.5937, "step": 1754500 }, { "epoch": 3.67376159694544, "grad_norm": 14.143796920776367, "learning_rate": 3.169755533739448e-05, "loss": 2.6025, "step": 1755000 }, { "epoch": 3.6748082526710655, "grad_norm": 14.15062141418457, "learning_rate": 3.1692311080904194e-05, "loss": 2.5707, "step": 1755500 }, { "epoch": 3.675854908396691, "grad_norm": 14.609033584594727, "learning_rate": 3.1687066824413905e-05, "loss": 2.5822, "step": 1756000 }, { "epoch": 3.6769015641223164, "grad_norm": 14.139880180358887, "learning_rate": 3.1681822567923616e-05, "loss": 2.5819, "step": 1756500 }, { "epoch": 3.677948219847942, "grad_norm": 13.588831901550293, "learning_rate": 3.167657831143332e-05, "loss": 2.5918, "step": 1757000 }, { "epoch": 3.6789948755735673, "grad_norm": 13.67131519317627, "learning_rate": 3.167133405494303e-05, "loss": 2.5863, "step": 1757500 }, { "epoch": 3.6800415312991928, "grad_norm": 13.312274932861328, "learning_rate": 3.1666089798452736e-05, "loss": 2.585, "step": 1758000 }, { "epoch": 3.681088187024818, "grad_norm": 12.12417221069336, "learning_rate": 3.166084554196244e-05, "loss": 2.5891, "step": 1758500 }, { "epoch": 3.6821348427504437, "grad_norm": 17.635364532470703, "learning_rate": 3.165560128547215e-05, "loss": 2.592, "step": 1759000 }, { "epoch": 3.683181498476069, "grad_norm": 12.478246688842773, "learning_rate": 3.1650357028981856e-05, "loss": 2.5893, "step": 1759500 }, { "epoch": 3.6842281542016946, "grad_norm": 12.639230728149414, "learning_rate": 3.164511277249157e-05, "loss": 2.5816, "step": 1760000 }, { "epoch": 3.6852748099273205, "grad_norm": 12.455080032348633, "learning_rate": 3.163986851600128e-05, "loss": 2.5849, "step": 1760500 }, { "epoch": 3.686321465652946, "grad_norm": 12.427542686462402, "learning_rate": 3.163462425951099e-05, "loss": 2.581, "step": 1761000 }, { "epoch": 3.6873681213785714, "grad_norm": 14.894560813903809, "learning_rate": 3.162938000302069e-05, "loss": 2.5962, "step": 1761500 }, { "epoch": 3.688414777104197, "grad_norm": 14.455601692199707, "learning_rate": 3.1624135746530404e-05, "loss": 2.5609, "step": 1762000 }, { "epoch": 3.6894614328298223, "grad_norm": 12.400545120239258, "learning_rate": 3.161889149004011e-05, "loss": 2.5843, "step": 1762500 }, { "epoch": 3.6905080885554478, "grad_norm": 14.95891284942627, "learning_rate": 3.161364723354982e-05, "loss": 2.5849, "step": 1763000 }, { "epoch": 3.691554744281073, "grad_norm": 13.404298782348633, "learning_rate": 3.1608402977059524e-05, "loss": 2.5796, "step": 1763500 }, { "epoch": 3.6926014000066987, "grad_norm": 12.484013557434082, "learning_rate": 3.160315872056923e-05, "loss": 2.5879, "step": 1764000 }, { "epoch": 3.693648055732324, "grad_norm": 12.78219223022461, "learning_rate": 3.159791446407894e-05, "loss": 2.5769, "step": 1764500 }, { "epoch": 3.6946947114579496, "grad_norm": 15.120111465454102, "learning_rate": 3.159267020758865e-05, "loss": 2.5879, "step": 1765000 }, { "epoch": 3.695741367183575, "grad_norm": 14.745562553405762, "learning_rate": 3.158742595109836e-05, "loss": 2.5892, "step": 1765500 }, { "epoch": 3.6967880229092005, "grad_norm": 13.9267578125, "learning_rate": 3.1582181694608066e-05, "loss": 2.5851, "step": 1766000 }, { "epoch": 3.697834678634826, "grad_norm": 14.263707160949707, "learning_rate": 3.157693743811778e-05, "loss": 2.5875, "step": 1766500 }, { "epoch": 3.6988813343604514, "grad_norm": 13.49354076385498, "learning_rate": 3.157169318162748e-05, "loss": 2.5808, "step": 1767000 }, { "epoch": 3.699927990086077, "grad_norm": 13.225556373596191, "learning_rate": 3.156644892513719e-05, "loss": 2.5567, "step": 1767500 }, { "epoch": 3.7009746458117023, "grad_norm": 11.543597221374512, "learning_rate": 3.15612046686469e-05, "loss": 2.5791, "step": 1768000 }, { "epoch": 3.702021301537328, "grad_norm": 12.241515159606934, "learning_rate": 3.155596041215661e-05, "loss": 2.5776, "step": 1768500 }, { "epoch": 3.7030679572629532, "grad_norm": 14.279908180236816, "learning_rate": 3.155071615566632e-05, "loss": 2.6078, "step": 1769000 }, { "epoch": 3.7041146129885787, "grad_norm": 18.389163970947266, "learning_rate": 3.154547189917602e-05, "loss": 2.5603, "step": 1769500 }, { "epoch": 3.705161268714204, "grad_norm": 12.67796802520752, "learning_rate": 3.1540227642685734e-05, "loss": 2.5834, "step": 1770000 }, { "epoch": 3.7062079244398296, "grad_norm": 15.084142684936523, "learning_rate": 3.153498338619544e-05, "loss": 2.567, "step": 1770500 }, { "epoch": 3.707254580165455, "grad_norm": 13.359902381896973, "learning_rate": 3.152973912970515e-05, "loss": 2.5915, "step": 1771000 }, { "epoch": 3.708301235891081, "grad_norm": 16.855789184570312, "learning_rate": 3.1524494873214854e-05, "loss": 2.5715, "step": 1771500 }, { "epoch": 3.7093478916167064, "grad_norm": 14.20744800567627, "learning_rate": 3.1519250616724565e-05, "loss": 2.5586, "step": 1772000 }, { "epoch": 3.710394547342332, "grad_norm": 15.38685417175293, "learning_rate": 3.151400636023427e-05, "loss": 2.5767, "step": 1772500 }, { "epoch": 3.7114412030679573, "grad_norm": 12.646074295043945, "learning_rate": 3.150876210374398e-05, "loss": 2.5812, "step": 1773000 }, { "epoch": 3.712487858793583, "grad_norm": 13.16985034942627, "learning_rate": 3.150351784725369e-05, "loss": 2.5889, "step": 1773500 }, { "epoch": 3.7135345145192082, "grad_norm": 14.329103469848633, "learning_rate": 3.14982735907634e-05, "loss": 2.5753, "step": 1774000 }, { "epoch": 3.7145811702448337, "grad_norm": 22.830825805664062, "learning_rate": 3.149302933427311e-05, "loss": 2.5823, "step": 1774500 }, { "epoch": 3.715627825970459, "grad_norm": 13.31597900390625, "learning_rate": 3.148778507778282e-05, "loss": 2.6003, "step": 1775000 }, { "epoch": 3.7166744816960846, "grad_norm": 13.899335861206055, "learning_rate": 3.148254082129252e-05, "loss": 2.5762, "step": 1775500 }, { "epoch": 3.71772113742171, "grad_norm": 14.068328857421875, "learning_rate": 3.1477296564802227e-05, "loss": 2.5809, "step": 1776000 }, { "epoch": 3.7187677931473355, "grad_norm": 13.692569732666016, "learning_rate": 3.147205230831194e-05, "loss": 2.6048, "step": 1776500 }, { "epoch": 3.719814448872961, "grad_norm": 15.814990997314453, "learning_rate": 3.146680805182164e-05, "loss": 2.5674, "step": 1777000 }, { "epoch": 3.7208611045985864, "grad_norm": 15.538422584533691, "learning_rate": 3.146156379533135e-05, "loss": 2.5719, "step": 1777500 }, { "epoch": 3.7219077603242123, "grad_norm": 14.22352123260498, "learning_rate": 3.1456319538841064e-05, "loss": 2.5827, "step": 1778000 }, { "epoch": 3.722954416049838, "grad_norm": 13.84058952331543, "learning_rate": 3.1451075282350775e-05, "loss": 2.5818, "step": 1778500 }, { "epoch": 3.7240010717754632, "grad_norm": 14.832014083862305, "learning_rate": 3.144583102586048e-05, "loss": 2.5752, "step": 1779000 }, { "epoch": 3.7250477275010887, "grad_norm": 15.17911148071289, "learning_rate": 3.144058676937019e-05, "loss": 2.572, "step": 1779500 }, { "epoch": 3.726094383226714, "grad_norm": 11.48369312286377, "learning_rate": 3.1435342512879895e-05, "loss": 2.6064, "step": 1780000 }, { "epoch": 3.7271410389523396, "grad_norm": 13.596168518066406, "learning_rate": 3.1430098256389606e-05, "loss": 2.5881, "step": 1780500 }, { "epoch": 3.728187694677965, "grad_norm": 14.576251029968262, "learning_rate": 3.142485399989931e-05, "loss": 2.5671, "step": 1781000 }, { "epoch": 3.7292343504035905, "grad_norm": 18.06085205078125, "learning_rate": 3.1419609743409015e-05, "loss": 2.5763, "step": 1781500 }, { "epoch": 3.730281006129216, "grad_norm": 14.155096054077148, "learning_rate": 3.1414365486918726e-05, "loss": 2.5801, "step": 1782000 }, { "epoch": 3.7313276618548414, "grad_norm": 15.735763549804688, "learning_rate": 3.140912123042844e-05, "loss": 2.5787, "step": 1782500 }, { "epoch": 3.732374317580467, "grad_norm": 13.462231636047363, "learning_rate": 3.140387697393815e-05, "loss": 2.5677, "step": 1783000 }, { "epoch": 3.7334209733060923, "grad_norm": 13.170134544372559, "learning_rate": 3.139863271744785e-05, "loss": 2.5884, "step": 1783500 }, { "epoch": 3.734467629031718, "grad_norm": 14.016874313354492, "learning_rate": 3.139338846095756e-05, "loss": 2.5823, "step": 1784000 }, { "epoch": 3.7355142847573433, "grad_norm": 17.48088264465332, "learning_rate": 3.138814420446727e-05, "loss": 2.5996, "step": 1784500 }, { "epoch": 3.7365609404829687, "grad_norm": 15.555214881896973, "learning_rate": 3.138289994797698e-05, "loss": 2.5924, "step": 1785000 }, { "epoch": 3.737607596208594, "grad_norm": 14.341889381408691, "learning_rate": 3.137765569148668e-05, "loss": 2.5899, "step": 1785500 }, { "epoch": 3.7386542519342196, "grad_norm": 14.010653495788574, "learning_rate": 3.1372411434996394e-05, "loss": 2.5655, "step": 1786000 }, { "epoch": 3.739700907659845, "grad_norm": 12.922472953796387, "learning_rate": 3.1367167178506105e-05, "loss": 2.5948, "step": 1786500 }, { "epoch": 3.7407475633854705, "grad_norm": 12.922473907470703, "learning_rate": 3.136192292201581e-05, "loss": 2.6007, "step": 1787000 }, { "epoch": 3.741794219111096, "grad_norm": 14.912527084350586, "learning_rate": 3.135667866552552e-05, "loss": 2.5851, "step": 1787500 }, { "epoch": 3.7428408748367215, "grad_norm": 12.011048316955566, "learning_rate": 3.1351434409035225e-05, "loss": 2.5925, "step": 1788000 }, { "epoch": 3.7438875305623474, "grad_norm": 14.659960746765137, "learning_rate": 3.1346190152544936e-05, "loss": 2.5805, "step": 1788500 }, { "epoch": 3.744934186287973, "grad_norm": 12.604025840759277, "learning_rate": 3.134094589605464e-05, "loss": 2.5745, "step": 1789000 }, { "epoch": 3.7459808420135983, "grad_norm": 14.754315376281738, "learning_rate": 3.133570163956435e-05, "loss": 2.5838, "step": 1789500 }, { "epoch": 3.7470274977392237, "grad_norm": 13.441598892211914, "learning_rate": 3.1330457383074055e-05, "loss": 2.5893, "step": 1790000 }, { "epoch": 3.748074153464849, "grad_norm": 15.267439842224121, "learning_rate": 3.1325213126583767e-05, "loss": 2.5632, "step": 1790500 }, { "epoch": 3.7491208091904746, "grad_norm": 15.258098602294922, "learning_rate": 3.131996887009348e-05, "loss": 2.5715, "step": 1791000 }, { "epoch": 3.7501674649161, "grad_norm": 13.56146240234375, "learning_rate": 3.131472461360319e-05, "loss": 2.5848, "step": 1791500 }, { "epoch": 3.7512141206417255, "grad_norm": 14.75948429107666, "learning_rate": 3.130948035711289e-05, "loss": 2.5801, "step": 1792000 }, { "epoch": 3.752260776367351, "grad_norm": 13.765016555786133, "learning_rate": 3.13042361006226e-05, "loss": 2.586, "step": 1792500 }, { "epoch": 3.7533074320929765, "grad_norm": 16.666038513183594, "learning_rate": 3.129899184413231e-05, "loss": 2.5909, "step": 1793000 }, { "epoch": 3.754354087818602, "grad_norm": 13.898666381835938, "learning_rate": 3.129374758764201e-05, "loss": 2.5732, "step": 1793500 }, { "epoch": 3.7554007435442274, "grad_norm": 13.220250129699707, "learning_rate": 3.1288503331151724e-05, "loss": 2.5792, "step": 1794000 }, { "epoch": 3.756447399269853, "grad_norm": 13.9594144821167, "learning_rate": 3.128325907466143e-05, "loss": 2.5852, "step": 1794500 }, { "epoch": 3.7574940549954783, "grad_norm": 14.046142578125, "learning_rate": 3.127801481817114e-05, "loss": 2.5848, "step": 1795000 }, { "epoch": 3.758540710721104, "grad_norm": 15.200285911560059, "learning_rate": 3.127277056168085e-05, "loss": 2.5719, "step": 1795500 }, { "epoch": 3.7595873664467296, "grad_norm": 17.83773422241211, "learning_rate": 3.126752630519056e-05, "loss": 2.6014, "step": 1796000 }, { "epoch": 3.760634022172355, "grad_norm": 14.58721923828125, "learning_rate": 3.1262282048700266e-05, "loss": 2.5781, "step": 1796500 }, { "epoch": 3.7616806778979806, "grad_norm": 14.298341751098633, "learning_rate": 3.125703779220998e-05, "loss": 2.5725, "step": 1797000 }, { "epoch": 3.762727333623606, "grad_norm": 14.044888496398926, "learning_rate": 3.125179353571968e-05, "loss": 2.5894, "step": 1797500 }, { "epoch": 3.7637739893492315, "grad_norm": 12.822834014892578, "learning_rate": 3.1246549279229385e-05, "loss": 2.5958, "step": 1798000 }, { "epoch": 3.764820645074857, "grad_norm": 14.339755058288574, "learning_rate": 3.1241305022739096e-05, "loss": 2.5927, "step": 1798500 }, { "epoch": 3.7658673008004824, "grad_norm": 12.486420631408691, "learning_rate": 3.12360607662488e-05, "loss": 2.5833, "step": 1799000 }, { "epoch": 3.766913956526108, "grad_norm": 12.978367805480957, "learning_rate": 3.123081650975851e-05, "loss": 2.5631, "step": 1799500 }, { "epoch": 3.7679606122517333, "grad_norm": 16.694028854370117, "learning_rate": 3.122557225326822e-05, "loss": 2.5643, "step": 1800000 }, { "epoch": 3.7690072679773587, "grad_norm": 13.198653221130371, "learning_rate": 3.1220327996777934e-05, "loss": 2.565, "step": 1800500 }, { "epoch": 3.770053923702984, "grad_norm": 22.300464630126953, "learning_rate": 3.121508374028764e-05, "loss": 2.5628, "step": 1801000 }, { "epoch": 3.7711005794286097, "grad_norm": 13.531105995178223, "learning_rate": 3.120983948379735e-05, "loss": 2.5803, "step": 1801500 }, { "epoch": 3.772147235154235, "grad_norm": 13.44925308227539, "learning_rate": 3.1204595227307054e-05, "loss": 2.5864, "step": 1802000 }, { "epoch": 3.7731938908798606, "grad_norm": 17.30584144592285, "learning_rate": 3.1199350970816765e-05, "loss": 2.575, "step": 1802500 }, { "epoch": 3.774240546605486, "grad_norm": 14.242551803588867, "learning_rate": 3.119410671432647e-05, "loss": 2.6035, "step": 1803000 }, { "epoch": 3.7752872023311115, "grad_norm": 12.739351272583008, "learning_rate": 3.118886245783617e-05, "loss": 2.5804, "step": 1803500 }, { "epoch": 3.776333858056737, "grad_norm": 18.375568389892578, "learning_rate": 3.1183618201345884e-05, "loss": 2.5792, "step": 1804000 }, { "epoch": 3.7773805137823624, "grad_norm": 14.002302169799805, "learning_rate": 3.1178373944855595e-05, "loss": 2.5753, "step": 1804500 }, { "epoch": 3.778427169507988, "grad_norm": 18.40602684020996, "learning_rate": 3.1173129688365307e-05, "loss": 2.5683, "step": 1805000 }, { "epoch": 3.7794738252336133, "grad_norm": 12.794807434082031, "learning_rate": 3.116788543187501e-05, "loss": 2.5768, "step": 1805500 }, { "epoch": 3.780520480959239, "grad_norm": 13.801725387573242, "learning_rate": 3.116264117538472e-05, "loss": 2.5796, "step": 1806000 }, { "epoch": 3.7815671366848647, "grad_norm": 11.902203559875488, "learning_rate": 3.1157396918894426e-05, "loss": 2.5866, "step": 1806500 }, { "epoch": 3.78261379241049, "grad_norm": 12.15571403503418, "learning_rate": 3.115215266240414e-05, "loss": 2.5823, "step": 1807000 }, { "epoch": 3.7836604481361156, "grad_norm": 13.95646858215332, "learning_rate": 3.114690840591384e-05, "loss": 2.5854, "step": 1807500 }, { "epoch": 3.784707103861741, "grad_norm": 12.286471366882324, "learning_rate": 3.114166414942355e-05, "loss": 2.5756, "step": 1808000 }, { "epoch": 3.7857537595873665, "grad_norm": 13.908307075500488, "learning_rate": 3.1136419892933264e-05, "loss": 2.5635, "step": 1808500 }, { "epoch": 3.786800415312992, "grad_norm": 14.744897842407227, "learning_rate": 3.1131175636442975e-05, "loss": 2.5859, "step": 1809000 }, { "epoch": 3.7878470710386174, "grad_norm": 12.841535568237305, "learning_rate": 3.112593137995268e-05, "loss": 2.5738, "step": 1809500 }, { "epoch": 3.788893726764243, "grad_norm": 16.013038635253906, "learning_rate": 3.1120687123462383e-05, "loss": 2.5841, "step": 1810000 }, { "epoch": 3.7899403824898683, "grad_norm": 12.099495887756348, "learning_rate": 3.1115442866972095e-05, "loss": 2.5612, "step": 1810500 }, { "epoch": 3.7909870382154938, "grad_norm": 13.891251564025879, "learning_rate": 3.11101986104818e-05, "loss": 2.5751, "step": 1811000 }, { "epoch": 3.792033693941119, "grad_norm": 14.79352855682373, "learning_rate": 3.110495435399151e-05, "loss": 2.5843, "step": 1811500 }, { "epoch": 3.7930803496667447, "grad_norm": 13.69851303100586, "learning_rate": 3.1099710097501214e-05, "loss": 2.5547, "step": 1812000 }, { "epoch": 3.79412700539237, "grad_norm": 17.023147583007812, "learning_rate": 3.1094465841010925e-05, "loss": 2.564, "step": 1812500 }, { "epoch": 3.795173661117996, "grad_norm": 15.490410804748535, "learning_rate": 3.1089221584520636e-05, "loss": 2.5922, "step": 1813000 }, { "epoch": 3.7962203168436215, "grad_norm": 12.65311050415039, "learning_rate": 3.108397732803035e-05, "loss": 2.5907, "step": 1813500 }, { "epoch": 3.797266972569247, "grad_norm": 13.272870063781738, "learning_rate": 3.107873307154005e-05, "loss": 2.5813, "step": 1814000 }, { "epoch": 3.7983136282948724, "grad_norm": 13.079813957214355, "learning_rate": 3.107348881504976e-05, "loss": 2.5799, "step": 1814500 }, { "epoch": 3.799360284020498, "grad_norm": 14.966517448425293, "learning_rate": 3.106824455855947e-05, "loss": 2.5718, "step": 1815000 }, { "epoch": 3.8004069397461233, "grad_norm": 13.341015815734863, "learning_rate": 3.106300030206917e-05, "loss": 2.5812, "step": 1815500 }, { "epoch": 3.8014535954717488, "grad_norm": 13.877593994140625, "learning_rate": 3.105775604557888e-05, "loss": 2.5601, "step": 1816000 }, { "epoch": 3.8025002511973742, "grad_norm": 14.528478622436523, "learning_rate": 3.105251178908859e-05, "loss": 2.5708, "step": 1816500 }, { "epoch": 3.8035469069229997, "grad_norm": 14.694356918334961, "learning_rate": 3.10472675325983e-05, "loss": 2.6003, "step": 1817000 }, { "epoch": 3.804593562648625, "grad_norm": 16.10995101928711, "learning_rate": 3.104202327610801e-05, "loss": 2.5762, "step": 1817500 }, { "epoch": 3.8056402183742506, "grad_norm": 15.117996215820312, "learning_rate": 3.103677901961772e-05, "loss": 2.5612, "step": 1818000 }, { "epoch": 3.806686874099876, "grad_norm": 15.624755859375, "learning_rate": 3.1031534763127424e-05, "loss": 2.5826, "step": 1818500 }, { "epoch": 3.8077335298255015, "grad_norm": 14.369009971618652, "learning_rate": 3.1026290506637135e-05, "loss": 2.5779, "step": 1819000 }, { "epoch": 3.808780185551127, "grad_norm": 13.83758544921875, "learning_rate": 3.102104625014684e-05, "loss": 2.5622, "step": 1819500 }, { "epoch": 3.8098268412767524, "grad_norm": 13.105052947998047, "learning_rate": 3.101580199365655e-05, "loss": 2.5706, "step": 1820000 }, { "epoch": 3.810873497002378, "grad_norm": 14.361424446105957, "learning_rate": 3.1010557737166255e-05, "loss": 2.585, "step": 1820500 }, { "epoch": 3.8119201527280033, "grad_norm": 14.792586326599121, "learning_rate": 3.100531348067596e-05, "loss": 2.5631, "step": 1821000 }, { "epoch": 3.812966808453629, "grad_norm": 12.61376667022705, "learning_rate": 3.100006922418567e-05, "loss": 2.5772, "step": 1821500 }, { "epoch": 3.8140134641792542, "grad_norm": 11.806755065917969, "learning_rate": 3.099482496769538e-05, "loss": 2.5797, "step": 1822000 }, { "epoch": 3.8150601199048797, "grad_norm": 13.803109169006348, "learning_rate": 3.098958071120509e-05, "loss": 2.5818, "step": 1822500 }, { "epoch": 3.816106775630505, "grad_norm": 15.367480278015137, "learning_rate": 3.09843364547148e-05, "loss": 2.5746, "step": 1823000 }, { "epoch": 3.817153431356131, "grad_norm": 13.832910537719727, "learning_rate": 3.097909219822451e-05, "loss": 2.5889, "step": 1823500 }, { "epoch": 3.8182000870817565, "grad_norm": 14.035930633544922, "learning_rate": 3.097384794173421e-05, "loss": 2.5883, "step": 1824000 }, { "epoch": 3.819246742807382, "grad_norm": 14.116829872131348, "learning_rate": 3.0968603685243923e-05, "loss": 2.5695, "step": 1824500 }, { "epoch": 3.8202933985330074, "grad_norm": 13.723298072814941, "learning_rate": 3.096335942875363e-05, "loss": 2.5634, "step": 1825000 }, { "epoch": 3.821340054258633, "grad_norm": 15.051197052001953, "learning_rate": 3.095811517226334e-05, "loss": 2.5783, "step": 1825500 }, { "epoch": 3.8223867099842583, "grad_norm": 15.068758010864258, "learning_rate": 3.095287091577305e-05, "loss": 2.5905, "step": 1826000 }, { "epoch": 3.823433365709884, "grad_norm": 15.70599365234375, "learning_rate": 3.0947626659282754e-05, "loss": 2.5769, "step": 1826500 }, { "epoch": 3.8244800214355092, "grad_norm": 13.127647399902344, "learning_rate": 3.0942382402792465e-05, "loss": 2.5827, "step": 1827000 }, { "epoch": 3.8255266771611347, "grad_norm": 14.364226341247559, "learning_rate": 3.093713814630217e-05, "loss": 2.5774, "step": 1827500 }, { "epoch": 3.82657333288676, "grad_norm": 17.368234634399414, "learning_rate": 3.093189388981188e-05, "loss": 2.5721, "step": 1828000 }, { "epoch": 3.8276199886123856, "grad_norm": 13.718436241149902, "learning_rate": 3.0926649633321585e-05, "loss": 2.5727, "step": 1828500 }, { "epoch": 3.828666644338011, "grad_norm": 14.418340682983398, "learning_rate": 3.0921405376831296e-05, "loss": 2.5562, "step": 1829000 }, { "epoch": 3.8297133000636365, "grad_norm": 13.796804428100586, "learning_rate": 3.0916161120341e-05, "loss": 2.5648, "step": 1829500 }, { "epoch": 3.830759955789262, "grad_norm": 14.71792984008789, "learning_rate": 3.091091686385071e-05, "loss": 2.5605, "step": 1830000 }, { "epoch": 3.831806611514888, "grad_norm": 14.77296257019043, "learning_rate": 3.090567260736042e-05, "loss": 2.5578, "step": 1830500 }, { "epoch": 3.8328532672405133, "grad_norm": 16.658859252929688, "learning_rate": 3.0900428350870134e-05, "loss": 2.5705, "step": 1831000 }, { "epoch": 3.833899922966139, "grad_norm": 14.015901565551758, "learning_rate": 3.089518409437984e-05, "loss": 2.5593, "step": 1831500 }, { "epoch": 3.8349465786917643, "grad_norm": 16.40154266357422, "learning_rate": 3.088993983788954e-05, "loss": 2.5944, "step": 1832000 }, { "epoch": 3.8359932344173897, "grad_norm": 14.628334999084473, "learning_rate": 3.088469558139925e-05, "loss": 2.5889, "step": 1832500 }, { "epoch": 3.837039890143015, "grad_norm": 14.585469245910645, "learning_rate": 3.087945132490896e-05, "loss": 2.5782, "step": 1833000 }, { "epoch": 3.8380865458686406, "grad_norm": 14.11220932006836, "learning_rate": 3.087420706841867e-05, "loss": 2.563, "step": 1833500 }, { "epoch": 3.839133201594266, "grad_norm": 15.2371826171875, "learning_rate": 3.086896281192837e-05, "loss": 2.5382, "step": 1834000 }, { "epoch": 3.8401798573198915, "grad_norm": 15.092574119567871, "learning_rate": 3.0863718555438084e-05, "loss": 2.5603, "step": 1834500 }, { "epoch": 3.841226513045517, "grad_norm": 12.390212059020996, "learning_rate": 3.0858474298947795e-05, "loss": 2.5681, "step": 1835000 }, { "epoch": 3.8422731687711424, "grad_norm": 17.030925750732422, "learning_rate": 3.0853230042457506e-05, "loss": 2.5744, "step": 1835500 }, { "epoch": 3.843319824496768, "grad_norm": 13.650412559509277, "learning_rate": 3.084798578596721e-05, "loss": 2.5491, "step": 1836000 }, { "epoch": 3.8443664802223934, "grad_norm": 14.734782218933105, "learning_rate": 3.084274152947692e-05, "loss": 2.5641, "step": 1836500 }, { "epoch": 3.845413135948019, "grad_norm": 12.855195045471191, "learning_rate": 3.0837497272986626e-05, "loss": 2.5847, "step": 1837000 }, { "epoch": 3.8464597916736443, "grad_norm": 13.138155937194824, "learning_rate": 3.083225301649634e-05, "loss": 2.5874, "step": 1837500 }, { "epoch": 3.8475064473992697, "grad_norm": 14.471920013427734, "learning_rate": 3.082700876000604e-05, "loss": 2.5635, "step": 1838000 }, { "epoch": 3.848553103124895, "grad_norm": 14.24730110168457, "learning_rate": 3.0821764503515746e-05, "loss": 2.5629, "step": 1838500 }, { "epoch": 3.8495997588505206, "grad_norm": 13.99874210357666, "learning_rate": 3.081652024702546e-05, "loss": 2.5617, "step": 1839000 }, { "epoch": 3.850646414576146, "grad_norm": 14.3945894241333, "learning_rate": 3.081127599053517e-05, "loss": 2.5709, "step": 1839500 }, { "epoch": 3.8516930703017715, "grad_norm": 14.295486450195312, "learning_rate": 3.080603173404488e-05, "loss": 2.5698, "step": 1840000 }, { "epoch": 3.852739726027397, "grad_norm": 13.903300285339355, "learning_rate": 3.080078747755458e-05, "loss": 2.5744, "step": 1840500 }, { "epoch": 3.853786381753023, "grad_norm": 14.783059120178223, "learning_rate": 3.0795543221064294e-05, "loss": 2.5555, "step": 1841000 }, { "epoch": 3.8548330374786484, "grad_norm": 12.889102935791016, "learning_rate": 3.0790298964574e-05, "loss": 2.5629, "step": 1841500 }, { "epoch": 3.855879693204274, "grad_norm": 14.94154167175293, "learning_rate": 3.078505470808371e-05, "loss": 2.5783, "step": 1842000 }, { "epoch": 3.8569263489298993, "grad_norm": 12.05952262878418, "learning_rate": 3.0779810451593414e-05, "loss": 2.569, "step": 1842500 }, { "epoch": 3.8579730046555247, "grad_norm": 17.157005310058594, "learning_rate": 3.0774566195103125e-05, "loss": 2.5644, "step": 1843000 }, { "epoch": 3.85901966038115, "grad_norm": 13.757843017578125, "learning_rate": 3.0769321938612836e-05, "loss": 2.5653, "step": 1843500 }, { "epoch": 3.8600663161067756, "grad_norm": 15.185547828674316, "learning_rate": 3.076407768212254e-05, "loss": 2.5818, "step": 1844000 }, { "epoch": 3.861112971832401, "grad_norm": 15.84499454498291, "learning_rate": 3.075883342563225e-05, "loss": 2.5887, "step": 1844500 }, { "epoch": 3.8621596275580266, "grad_norm": 14.3101224899292, "learning_rate": 3.0753589169141956e-05, "loss": 2.5761, "step": 1845000 }, { "epoch": 3.863206283283652, "grad_norm": 12.754878044128418, "learning_rate": 3.074834491265167e-05, "loss": 2.564, "step": 1845500 }, { "epoch": 3.8642529390092775, "grad_norm": 15.185264587402344, "learning_rate": 3.074310065616137e-05, "loss": 2.5751, "step": 1846000 }, { "epoch": 3.865299594734903, "grad_norm": 14.230659484863281, "learning_rate": 3.073785639967108e-05, "loss": 2.5693, "step": 1846500 }, { "epoch": 3.8663462504605284, "grad_norm": 13.982437133789062, "learning_rate": 3.0732612143180786e-05, "loss": 2.5481, "step": 1847000 }, { "epoch": 3.8673929061861543, "grad_norm": 13.18350601196289, "learning_rate": 3.07273678866905e-05, "loss": 2.594, "step": 1847500 }, { "epoch": 3.8684395619117797, "grad_norm": 16.57744598388672, "learning_rate": 3.072212363020021e-05, "loss": 2.5763, "step": 1848000 }, { "epoch": 3.869486217637405, "grad_norm": 14.537352561950684, "learning_rate": 3.071687937370992e-05, "loss": 2.5592, "step": 1848500 }, { "epoch": 3.8705328733630306, "grad_norm": 14.170025825500488, "learning_rate": 3.0711635117219624e-05, "loss": 2.5812, "step": 1849000 }, { "epoch": 3.871579529088656, "grad_norm": 13.364374160766602, "learning_rate": 3.070639086072933e-05, "loss": 2.566, "step": 1849500 }, { "epoch": 3.8726261848142816, "grad_norm": 14.01558780670166, "learning_rate": 3.070114660423904e-05, "loss": 2.5824, "step": 1850000 }, { "epoch": 3.873672840539907, "grad_norm": 12.588175773620605, "learning_rate": 3.0695902347748744e-05, "loss": 2.562, "step": 1850500 }, { "epoch": 3.8747194962655325, "grad_norm": 15.891342163085938, "learning_rate": 3.0690658091258455e-05, "loss": 2.5758, "step": 1851000 }, { "epoch": 3.875766151991158, "grad_norm": 16.371442794799805, "learning_rate": 3.068541383476816e-05, "loss": 2.5595, "step": 1851500 }, { "epoch": 3.8768128077167834, "grad_norm": 13.506487846374512, "learning_rate": 3.068016957827787e-05, "loss": 2.5833, "step": 1852000 }, { "epoch": 3.877859463442409, "grad_norm": 13.621278762817383, "learning_rate": 3.067492532178758e-05, "loss": 2.5709, "step": 1852500 }, { "epoch": 3.8789061191680343, "grad_norm": 14.383201599121094, "learning_rate": 3.066968106529729e-05, "loss": 2.6081, "step": 1853000 }, { "epoch": 3.8799527748936598, "grad_norm": 16.826770782470703, "learning_rate": 3.0664436808807e-05, "loss": 2.5759, "step": 1853500 }, { "epoch": 3.880999430619285, "grad_norm": 15.909746170043945, "learning_rate": 3.065919255231671e-05, "loss": 2.5741, "step": 1854000 }, { "epoch": 3.8820460863449107, "grad_norm": 15.402275085449219, "learning_rate": 3.065394829582641e-05, "loss": 2.5577, "step": 1854500 }, { "epoch": 3.883092742070536, "grad_norm": 15.96881103515625, "learning_rate": 3.0648704039336116e-05, "loss": 2.5564, "step": 1855000 }, { "epoch": 3.8841393977961616, "grad_norm": 13.403229713439941, "learning_rate": 3.064345978284583e-05, "loss": 2.571, "step": 1855500 }, { "epoch": 3.885186053521787, "grad_norm": 11.897123336791992, "learning_rate": 3.063821552635553e-05, "loss": 2.5568, "step": 1856000 }, { "epoch": 3.8862327092474125, "grad_norm": 11.963269233703613, "learning_rate": 3.063297126986524e-05, "loss": 2.5686, "step": 1856500 }, { "epoch": 3.887279364973038, "grad_norm": 15.632116317749023, "learning_rate": 3.0627727013374954e-05, "loss": 2.5855, "step": 1857000 }, { "epoch": 3.8883260206986634, "grad_norm": 15.553099632263184, "learning_rate": 3.0622482756884665e-05, "loss": 2.5883, "step": 1857500 }, { "epoch": 3.889372676424289, "grad_norm": 14.684657096862793, "learning_rate": 3.061723850039437e-05, "loss": 2.5819, "step": 1858000 }, { "epoch": 3.8904193321499148, "grad_norm": 13.564104080200195, "learning_rate": 3.061199424390408e-05, "loss": 2.5651, "step": 1858500 }, { "epoch": 3.89146598787554, "grad_norm": 15.006824493408203, "learning_rate": 3.0606749987413785e-05, "loss": 2.5639, "step": 1859000 }, { "epoch": 3.8925126436011657, "grad_norm": 15.916449546813965, "learning_rate": 3.0601505730923496e-05, "loss": 2.5699, "step": 1859500 }, { "epoch": 3.893559299326791, "grad_norm": 19.542282104492188, "learning_rate": 3.05962614744332e-05, "loss": 2.5951, "step": 1860000 }, { "epoch": 3.8946059550524166, "grad_norm": 13.313820838928223, "learning_rate": 3.059101721794291e-05, "loss": 2.575, "step": 1860500 }, { "epoch": 3.895652610778042, "grad_norm": 14.443830490112305, "learning_rate": 3.058577296145262e-05, "loss": 2.5737, "step": 1861000 }, { "epoch": 3.8966992665036675, "grad_norm": 14.463678359985352, "learning_rate": 3.0580528704962326e-05, "loss": 2.5564, "step": 1861500 }, { "epoch": 3.897745922229293, "grad_norm": 15.266168594360352, "learning_rate": 3.057528444847204e-05, "loss": 2.5856, "step": 1862000 }, { "epoch": 3.8987925779549184, "grad_norm": 14.113180160522461, "learning_rate": 3.057004019198174e-05, "loss": 2.5628, "step": 1862500 }, { "epoch": 3.899839233680544, "grad_norm": 14.859914779663086, "learning_rate": 3.056479593549145e-05, "loss": 2.5816, "step": 1863000 }, { "epoch": 3.9008858894061693, "grad_norm": 13.078120231628418, "learning_rate": 3.055955167900116e-05, "loss": 2.5471, "step": 1863500 }, { "epoch": 3.9019325451317948, "grad_norm": 13.835451126098633, "learning_rate": 3.055430742251087e-05, "loss": 2.5444, "step": 1864000 }, { "epoch": 3.9029792008574202, "grad_norm": 13.755539894104004, "learning_rate": 3.054906316602057e-05, "loss": 2.5813, "step": 1864500 }, { "epoch": 3.904025856583046, "grad_norm": 15.79088020324707, "learning_rate": 3.0543818909530284e-05, "loss": 2.5576, "step": 1865000 }, { "epoch": 3.9050725123086716, "grad_norm": 20.053871154785156, "learning_rate": 3.0538574653039995e-05, "loss": 2.5515, "step": 1865500 }, { "epoch": 3.906119168034297, "grad_norm": 11.744597434997559, "learning_rate": 3.05333303965497e-05, "loss": 2.5627, "step": 1866000 }, { "epoch": 3.9071658237599225, "grad_norm": 14.796585083007812, "learning_rate": 3.052808614005941e-05, "loss": 2.5927, "step": 1866500 }, { "epoch": 3.908212479485548, "grad_norm": 14.681218147277832, "learning_rate": 3.0522841883569114e-05, "loss": 2.5736, "step": 1867000 }, { "epoch": 3.9092591352111734, "grad_norm": 14.663121223449707, "learning_rate": 3.0517597627078826e-05, "loss": 2.5448, "step": 1867500 }, { "epoch": 3.910305790936799, "grad_norm": 14.429346084594727, "learning_rate": 3.051235337058853e-05, "loss": 2.5574, "step": 1868000 }, { "epoch": 3.9113524466624243, "grad_norm": 15.156332969665527, "learning_rate": 3.050710911409824e-05, "loss": 2.5632, "step": 1868500 }, { "epoch": 3.9123991023880498, "grad_norm": 13.329412460327148, "learning_rate": 3.050186485760795e-05, "loss": 2.5496, "step": 1869000 }, { "epoch": 3.9134457581136752, "grad_norm": 18.92348289489746, "learning_rate": 3.049662060111766e-05, "loss": 2.565, "step": 1869500 }, { "epoch": 3.9144924138393007, "grad_norm": 13.102716445922852, "learning_rate": 3.0491376344627364e-05, "loss": 2.5437, "step": 1870000 }, { "epoch": 3.915539069564926, "grad_norm": 15.183428764343262, "learning_rate": 3.0486132088137075e-05, "loss": 2.5886, "step": 1870500 }, { "epoch": 3.9165857252905516, "grad_norm": 13.478803634643555, "learning_rate": 3.0480887831646783e-05, "loss": 2.5584, "step": 1871000 }, { "epoch": 3.917632381016177, "grad_norm": 13.450512886047363, "learning_rate": 3.0475643575156494e-05, "loss": 2.5707, "step": 1871500 }, { "epoch": 3.9186790367418025, "grad_norm": 15.491972923278809, "learning_rate": 3.0470399318666198e-05, "loss": 2.5813, "step": 1872000 }, { "epoch": 3.919725692467428, "grad_norm": 14.588622093200684, "learning_rate": 3.0465155062175902e-05, "loss": 2.5624, "step": 1872500 }, { "epoch": 3.9207723481930534, "grad_norm": 13.68448257446289, "learning_rate": 3.0459910805685617e-05, "loss": 2.5633, "step": 1873000 }, { "epoch": 3.921819003918679, "grad_norm": 13.758883476257324, "learning_rate": 3.045466654919532e-05, "loss": 2.561, "step": 1873500 }, { "epoch": 3.9228656596443043, "grad_norm": 13.631280899047852, "learning_rate": 3.0449422292705032e-05, "loss": 2.5651, "step": 1874000 }, { "epoch": 3.92391231536993, "grad_norm": 13.136406898498535, "learning_rate": 3.0444178036214737e-05, "loss": 2.5649, "step": 1874500 }, { "epoch": 3.9249589710955552, "grad_norm": 16.68004035949707, "learning_rate": 3.0438933779724448e-05, "loss": 2.5531, "step": 1875000 }, { "epoch": 3.9260056268211807, "grad_norm": 13.759825706481934, "learning_rate": 3.0433689523234155e-05, "loss": 2.5576, "step": 1875500 }, { "epoch": 3.9270522825468066, "grad_norm": 14.519694328308105, "learning_rate": 3.0428445266743866e-05, "loss": 2.5734, "step": 1876000 }, { "epoch": 3.928098938272432, "grad_norm": 14.728185653686523, "learning_rate": 3.042320101025357e-05, "loss": 2.5574, "step": 1876500 }, { "epoch": 3.9291455939980575, "grad_norm": 12.70145320892334, "learning_rate": 3.0417956753763282e-05, "loss": 2.5682, "step": 1877000 }, { "epoch": 3.930192249723683, "grad_norm": 15.485761642456055, "learning_rate": 3.041271249727299e-05, "loss": 2.5711, "step": 1877500 }, { "epoch": 3.9312389054493084, "grad_norm": 13.21676254272461, "learning_rate": 3.0407468240782694e-05, "loss": 2.5545, "step": 1878000 }, { "epoch": 3.932285561174934, "grad_norm": 15.395923614501953, "learning_rate": 3.0402223984292405e-05, "loss": 2.5555, "step": 1878500 }, { "epoch": 3.9333322169005593, "grad_norm": 16.422935485839844, "learning_rate": 3.039697972780211e-05, "loss": 2.5588, "step": 1879000 }, { "epoch": 3.934378872626185, "grad_norm": 16.57903480529785, "learning_rate": 3.039173547131182e-05, "loss": 2.5424, "step": 1879500 }, { "epoch": 3.9354255283518103, "grad_norm": 14.439245223999023, "learning_rate": 3.0386491214821528e-05, "loss": 2.5561, "step": 1880000 }, { "epoch": 3.9364721840774357, "grad_norm": 13.147229194641113, "learning_rate": 3.038124695833124e-05, "loss": 2.5827, "step": 1880500 }, { "epoch": 3.937518839803061, "grad_norm": 13.666015625, "learning_rate": 3.0376002701840943e-05, "loss": 2.5663, "step": 1881000 }, { "epoch": 3.9385654955286866, "grad_norm": 18.18890953063965, "learning_rate": 3.0370758445350654e-05, "loss": 2.5883, "step": 1881500 }, { "epoch": 3.939612151254312, "grad_norm": 14.04914379119873, "learning_rate": 3.0365514188860362e-05, "loss": 2.5599, "step": 1882000 }, { "epoch": 3.940658806979938, "grad_norm": 13.193723678588867, "learning_rate": 3.0360269932370073e-05, "loss": 2.5763, "step": 1882500 }, { "epoch": 3.9417054627055634, "grad_norm": 12.702287673950195, "learning_rate": 3.0355025675879778e-05, "loss": 2.5647, "step": 1883000 }, { "epoch": 3.942752118431189, "grad_norm": 11.549362182617188, "learning_rate": 3.0349781419389485e-05, "loss": 2.567, "step": 1883500 }, { "epoch": 3.9437987741568143, "grad_norm": 15.743208885192871, "learning_rate": 3.0344537162899196e-05, "loss": 2.5651, "step": 1884000 }, { "epoch": 3.94484542988244, "grad_norm": 14.692502975463867, "learning_rate": 3.03392929064089e-05, "loss": 2.5594, "step": 1884500 }, { "epoch": 3.9458920856080653, "grad_norm": 16.040321350097656, "learning_rate": 3.033404864991861e-05, "loss": 2.5903, "step": 1885000 }, { "epoch": 3.9469387413336907, "grad_norm": 17.812427520751953, "learning_rate": 3.0328804393428316e-05, "loss": 2.571, "step": 1885500 }, { "epoch": 3.947985397059316, "grad_norm": 12.836655616760254, "learning_rate": 3.0323560136938027e-05, "loss": 2.5735, "step": 1886000 }, { "epoch": 3.9490320527849416, "grad_norm": 13.276121139526367, "learning_rate": 3.0318315880447735e-05, "loss": 2.5764, "step": 1886500 }, { "epoch": 3.950078708510567, "grad_norm": 17.80423355102539, "learning_rate": 3.0313071623957446e-05, "loss": 2.5618, "step": 1887000 }, { "epoch": 3.9511253642361925, "grad_norm": 16.486583709716797, "learning_rate": 3.030782736746715e-05, "loss": 2.5616, "step": 1887500 }, { "epoch": 3.952172019961818, "grad_norm": 14.536334991455078, "learning_rate": 3.030258311097686e-05, "loss": 2.563, "step": 1888000 }, { "epoch": 3.9532186756874435, "grad_norm": 15.322723388671875, "learning_rate": 3.029733885448657e-05, "loss": 2.5671, "step": 1888500 }, { "epoch": 3.954265331413069, "grad_norm": 15.143549919128418, "learning_rate": 3.0292094597996273e-05, "loss": 2.5654, "step": 1889000 }, { "epoch": 3.9553119871386944, "grad_norm": 13.295762062072754, "learning_rate": 3.0286850341505984e-05, "loss": 2.57, "step": 1889500 }, { "epoch": 3.95635864286432, "grad_norm": 15.029439926147461, "learning_rate": 3.028160608501569e-05, "loss": 2.5736, "step": 1890000 }, { "epoch": 3.9574052985899453, "grad_norm": 12.944684028625488, "learning_rate": 3.0276361828525403e-05, "loss": 2.5662, "step": 1890500 }, { "epoch": 3.9584519543155707, "grad_norm": 14.184069633483887, "learning_rate": 3.0271117572035107e-05, "loss": 2.5597, "step": 1891000 }, { "epoch": 3.959498610041196, "grad_norm": 18.7308406829834, "learning_rate": 3.026587331554482e-05, "loss": 2.5627, "step": 1891500 }, { "epoch": 3.9605452657668216, "grad_norm": 14.322811126708984, "learning_rate": 3.0260629059054523e-05, "loss": 2.5766, "step": 1892000 }, { "epoch": 3.961591921492447, "grad_norm": 14.037549018859863, "learning_rate": 3.0255384802564234e-05, "loss": 2.5512, "step": 1892500 }, { "epoch": 3.9626385772180726, "grad_norm": 13.006865501403809, "learning_rate": 3.025014054607394e-05, "loss": 2.5635, "step": 1893000 }, { "epoch": 3.9636852329436985, "grad_norm": 13.22152042388916, "learning_rate": 3.0244896289583653e-05, "loss": 2.568, "step": 1893500 }, { "epoch": 3.964731888669324, "grad_norm": 12.936583518981934, "learning_rate": 3.0239652033093357e-05, "loss": 2.5757, "step": 1894000 }, { "epoch": 3.9657785443949494, "grad_norm": 13.400964736938477, "learning_rate": 3.0234407776603065e-05, "loss": 2.56, "step": 1894500 }, { "epoch": 3.966825200120575, "grad_norm": 16.58565330505371, "learning_rate": 3.0229163520112776e-05, "loss": 2.5716, "step": 1895000 }, { "epoch": 3.9678718558462003, "grad_norm": 16.198041915893555, "learning_rate": 3.022391926362248e-05, "loss": 2.5564, "step": 1895500 }, { "epoch": 3.9689185115718257, "grad_norm": 13.081537246704102, "learning_rate": 3.021867500713219e-05, "loss": 2.5696, "step": 1896000 }, { "epoch": 3.969965167297451, "grad_norm": 15.092610359191895, "learning_rate": 3.0213430750641895e-05, "loss": 2.5797, "step": 1896500 }, { "epoch": 3.9710118230230766, "grad_norm": 14.8792142868042, "learning_rate": 3.0208186494151606e-05, "loss": 2.5778, "step": 1897000 }, { "epoch": 3.972058478748702, "grad_norm": 16.871110916137695, "learning_rate": 3.0202942237661314e-05, "loss": 2.5455, "step": 1897500 }, { "epoch": 3.9731051344743276, "grad_norm": 13.74767017364502, "learning_rate": 3.0197697981171025e-05, "loss": 2.5596, "step": 1898000 }, { "epoch": 3.974151790199953, "grad_norm": 18.20973014831543, "learning_rate": 3.019245372468073e-05, "loss": 2.5654, "step": 1898500 }, { "epoch": 3.9751984459255785, "grad_norm": 16.465709686279297, "learning_rate": 3.018720946819044e-05, "loss": 2.5602, "step": 1899000 }, { "epoch": 3.976245101651204, "grad_norm": 15.554404258728027, "learning_rate": 3.0181965211700148e-05, "loss": 2.5747, "step": 1899500 }, { "epoch": 3.97729175737683, "grad_norm": 14.820489883422852, "learning_rate": 3.0176720955209853e-05, "loss": 2.5728, "step": 1900000 }, { "epoch": 3.9783384131024553, "grad_norm": 15.179065704345703, "learning_rate": 3.0171476698719564e-05, "loss": 2.5682, "step": 1900500 }, { "epoch": 3.9793850688280807, "grad_norm": 17.491098403930664, "learning_rate": 3.016623244222927e-05, "loss": 2.5508, "step": 1901000 }, { "epoch": 3.980431724553706, "grad_norm": 17.86079216003418, "learning_rate": 3.0160988185738982e-05, "loss": 2.542, "step": 1901500 }, { "epoch": 3.9814783802793317, "grad_norm": 14.000906944274902, "learning_rate": 3.0155743929248687e-05, "loss": 2.5682, "step": 1902000 }, { "epoch": 3.982525036004957, "grad_norm": 14.653892517089844, "learning_rate": 3.0150499672758398e-05, "loss": 2.5514, "step": 1902500 }, { "epoch": 3.9835716917305826, "grad_norm": 15.215910911560059, "learning_rate": 3.0145255416268102e-05, "loss": 2.5654, "step": 1903000 }, { "epoch": 3.984618347456208, "grad_norm": 15.55965805053711, "learning_rate": 3.0140011159777813e-05, "loss": 2.5662, "step": 1903500 }, { "epoch": 3.9856650031818335, "grad_norm": 15.004844665527344, "learning_rate": 3.013476690328752e-05, "loss": 2.5584, "step": 1904000 }, { "epoch": 3.986711658907459, "grad_norm": 16.002132415771484, "learning_rate": 3.0129522646797232e-05, "loss": 2.5648, "step": 1904500 }, { "epoch": 3.9877583146330844, "grad_norm": 14.431142807006836, "learning_rate": 3.0124278390306936e-05, "loss": 2.5697, "step": 1905000 }, { "epoch": 3.98880497035871, "grad_norm": 13.652318954467773, "learning_rate": 3.0119034133816647e-05, "loss": 2.5731, "step": 1905500 }, { "epoch": 3.9898516260843353, "grad_norm": 15.755651473999023, "learning_rate": 3.0113789877326355e-05, "loss": 2.5691, "step": 1906000 }, { "epoch": 3.9908982818099608, "grad_norm": 14.002219200134277, "learning_rate": 3.010854562083606e-05, "loss": 2.5676, "step": 1906500 }, { "epoch": 3.991944937535586, "grad_norm": 16.09536361694336, "learning_rate": 3.010330136434577e-05, "loss": 2.5768, "step": 1907000 }, { "epoch": 3.9929915932612117, "grad_norm": 12.884222030639648, "learning_rate": 3.0098057107855475e-05, "loss": 2.5644, "step": 1907500 }, { "epoch": 3.994038248986837, "grad_norm": 12.633593559265137, "learning_rate": 3.0092812851365186e-05, "loss": 2.5603, "step": 1908000 }, { "epoch": 3.9950849047124626, "grad_norm": 15.874398231506348, "learning_rate": 3.0087568594874893e-05, "loss": 2.5605, "step": 1908500 }, { "epoch": 3.996131560438088, "grad_norm": 14.08243179321289, "learning_rate": 3.0082324338384605e-05, "loss": 2.5521, "step": 1909000 }, { "epoch": 3.9971782161637135, "grad_norm": 17.140304565429688, "learning_rate": 3.007708008189431e-05, "loss": 2.5778, "step": 1909500 }, { "epoch": 3.998224871889339, "grad_norm": 12.803540229797363, "learning_rate": 3.007183582540402e-05, "loss": 2.5594, "step": 1910000 }, { "epoch": 3.9992715276149644, "grad_norm": 13.94594955444336, "learning_rate": 3.0066591568913728e-05, "loss": 2.5619, "step": 1910500 }, { "epoch": 4.00031818334059, "grad_norm": 22.57588005065918, "learning_rate": 3.006134731242344e-05, "loss": 2.5474, "step": 1911000 }, { "epoch": 4.001364839066215, "grad_norm": 13.487364768981934, "learning_rate": 3.0056103055933143e-05, "loss": 2.5513, "step": 1911500 }, { "epoch": 4.002411494791841, "grad_norm": 13.954955101013184, "learning_rate": 3.005085879944285e-05, "loss": 2.5534, "step": 1912000 }, { "epoch": 4.003458150517466, "grad_norm": 15.14806079864502, "learning_rate": 3.0045614542952562e-05, "loss": 2.5446, "step": 1912500 }, { "epoch": 4.004504806243092, "grad_norm": 12.924785614013672, "learning_rate": 3.0040370286462266e-05, "loss": 2.5547, "step": 1913000 }, { "epoch": 4.005551461968717, "grad_norm": 15.267414093017578, "learning_rate": 3.0035126029971977e-05, "loss": 2.5548, "step": 1913500 }, { "epoch": 4.006598117694343, "grad_norm": 13.600030899047852, "learning_rate": 3.002988177348168e-05, "loss": 2.5522, "step": 1914000 }, { "epoch": 4.007644773419969, "grad_norm": 15.636792182922363, "learning_rate": 3.0024637516991393e-05, "loss": 2.5653, "step": 1914500 }, { "epoch": 4.008691429145594, "grad_norm": 16.0795955657959, "learning_rate": 3.00193932605011e-05, "loss": 2.5644, "step": 1915000 }, { "epoch": 4.00973808487122, "grad_norm": 15.875439643859863, "learning_rate": 3.001414900401081e-05, "loss": 2.5768, "step": 1915500 }, { "epoch": 4.010784740596845, "grad_norm": 14.7783203125, "learning_rate": 3.0008904747520516e-05, "loss": 2.5648, "step": 1916000 }, { "epoch": 4.011831396322471, "grad_norm": 14.082164764404297, "learning_rate": 3.0003660491030227e-05, "loss": 2.5526, "step": 1916500 }, { "epoch": 4.012878052048096, "grad_norm": 16.831636428833008, "learning_rate": 2.9998416234539934e-05, "loss": 2.5608, "step": 1917000 }, { "epoch": 4.013924707773722, "grad_norm": 12.65880298614502, "learning_rate": 2.999317197804964e-05, "loss": 2.5672, "step": 1917500 }, { "epoch": 4.014971363499347, "grad_norm": 13.858748435974121, "learning_rate": 2.998792772155935e-05, "loss": 2.5329, "step": 1918000 }, { "epoch": 4.016018019224973, "grad_norm": 14.751265525817871, "learning_rate": 2.9982683465069057e-05, "loss": 2.5702, "step": 1918500 }, { "epoch": 4.017064674950598, "grad_norm": 15.352621078491211, "learning_rate": 2.997743920857877e-05, "loss": 2.5535, "step": 1919000 }, { "epoch": 4.0181113306762235, "grad_norm": 13.473865509033203, "learning_rate": 2.9972194952088473e-05, "loss": 2.5678, "step": 1919500 }, { "epoch": 4.019157986401849, "grad_norm": 14.05510139465332, "learning_rate": 2.9966950695598184e-05, "loss": 2.5567, "step": 1920000 }, { "epoch": 4.020204642127474, "grad_norm": 18.12462615966797, "learning_rate": 2.9961706439107888e-05, "loss": 2.5798, "step": 1920500 }, { "epoch": 4.0212512978531, "grad_norm": 15.513616561889648, "learning_rate": 2.99564621826176e-05, "loss": 2.5635, "step": 1921000 }, { "epoch": 4.022297953578725, "grad_norm": 13.540534973144531, "learning_rate": 2.9951217926127307e-05, "loss": 2.532, "step": 1921500 }, { "epoch": 4.023344609304351, "grad_norm": 14.710307121276855, "learning_rate": 2.9945973669637018e-05, "loss": 2.5767, "step": 1922000 }, { "epoch": 4.024391265029976, "grad_norm": 12.235822677612305, "learning_rate": 2.9940729413146722e-05, "loss": 2.5432, "step": 1922500 }, { "epoch": 4.025437920755602, "grad_norm": 12.983097076416016, "learning_rate": 2.993548515665643e-05, "loss": 2.5604, "step": 1923000 }, { "epoch": 4.026484576481227, "grad_norm": 13.207657814025879, "learning_rate": 2.993024090016614e-05, "loss": 2.5526, "step": 1923500 }, { "epoch": 4.027531232206853, "grad_norm": 15.73458194732666, "learning_rate": 2.9924996643675845e-05, "loss": 2.5482, "step": 1924000 }, { "epoch": 4.028577887932478, "grad_norm": 14.495548248291016, "learning_rate": 2.9919752387185557e-05, "loss": 2.5453, "step": 1924500 }, { "epoch": 4.0296245436581035, "grad_norm": 14.758407592773438, "learning_rate": 2.991450813069526e-05, "loss": 2.5353, "step": 1925000 }, { "epoch": 4.030671199383729, "grad_norm": 13.96451187133789, "learning_rate": 2.9909263874204972e-05, "loss": 2.5656, "step": 1925500 }, { "epoch": 4.031717855109354, "grad_norm": 16.20943260192871, "learning_rate": 2.990401961771468e-05, "loss": 2.5485, "step": 1926000 }, { "epoch": 4.03276451083498, "grad_norm": 15.471434593200684, "learning_rate": 2.989877536122439e-05, "loss": 2.5707, "step": 1926500 }, { "epoch": 4.033811166560605, "grad_norm": 13.79173469543457, "learning_rate": 2.9893531104734095e-05, "loss": 2.5617, "step": 1927000 }, { "epoch": 4.034857822286231, "grad_norm": 13.200348854064941, "learning_rate": 2.9888286848243806e-05, "loss": 2.5618, "step": 1927500 }, { "epoch": 4.035904478011856, "grad_norm": 14.840755462646484, "learning_rate": 2.9883042591753514e-05, "loss": 2.5495, "step": 1928000 }, { "epoch": 4.036951133737482, "grad_norm": 15.313641548156738, "learning_rate": 2.9877798335263218e-05, "loss": 2.5644, "step": 1928500 }, { "epoch": 4.037997789463107, "grad_norm": 13.73321533203125, "learning_rate": 2.987255407877293e-05, "loss": 2.5514, "step": 1929000 }, { "epoch": 4.039044445188733, "grad_norm": 14.33504867553711, "learning_rate": 2.9867309822282637e-05, "loss": 2.5696, "step": 1929500 }, { "epoch": 4.040091100914358, "grad_norm": 13.940930366516113, "learning_rate": 2.9862065565792348e-05, "loss": 2.5551, "step": 1930000 }, { "epoch": 4.0411377566399835, "grad_norm": 15.008676528930664, "learning_rate": 2.9856821309302052e-05, "loss": 2.5644, "step": 1930500 }, { "epoch": 4.042184412365609, "grad_norm": 13.359127044677734, "learning_rate": 2.9851577052811763e-05, "loss": 2.5585, "step": 1931000 }, { "epoch": 4.0432310680912344, "grad_norm": 13.583076477050781, "learning_rate": 2.9846332796321468e-05, "loss": 2.5634, "step": 1931500 }, { "epoch": 4.044277723816861, "grad_norm": 14.519991874694824, "learning_rate": 2.984108853983118e-05, "loss": 2.5649, "step": 1932000 }, { "epoch": 4.045324379542486, "grad_norm": 13.558488845825195, "learning_rate": 2.9835844283340886e-05, "loss": 2.5463, "step": 1932500 }, { "epoch": 4.046371035268112, "grad_norm": 16.217884063720703, "learning_rate": 2.9830600026850597e-05, "loss": 2.5701, "step": 1933000 }, { "epoch": 4.047417690993737, "grad_norm": 15.56229305267334, "learning_rate": 2.9825355770360302e-05, "loss": 2.5641, "step": 1933500 }, { "epoch": 4.048464346719363, "grad_norm": 16.838932037353516, "learning_rate": 2.982011151387001e-05, "loss": 2.5545, "step": 1934000 }, { "epoch": 4.049511002444988, "grad_norm": 14.181163787841797, "learning_rate": 2.981486725737972e-05, "loss": 2.5591, "step": 1934500 }, { "epoch": 4.0505576581706135, "grad_norm": 14.424494743347168, "learning_rate": 2.9809623000889425e-05, "loss": 2.5634, "step": 1935000 }, { "epoch": 4.051604313896239, "grad_norm": 13.120125770568848, "learning_rate": 2.9804378744399136e-05, "loss": 2.545, "step": 1935500 }, { "epoch": 4.052650969621864, "grad_norm": 16.630029678344727, "learning_rate": 2.979913448790884e-05, "loss": 2.5707, "step": 1936000 }, { "epoch": 4.05369762534749, "grad_norm": 14.263908386230469, "learning_rate": 2.9793890231418555e-05, "loss": 2.5735, "step": 1936500 }, { "epoch": 4.054744281073115, "grad_norm": 15.122203826904297, "learning_rate": 2.978864597492826e-05, "loss": 2.5612, "step": 1937000 }, { "epoch": 4.055790936798741, "grad_norm": 12.910266876220703, "learning_rate": 2.978340171843797e-05, "loss": 2.5654, "step": 1937500 }, { "epoch": 4.056837592524366, "grad_norm": 15.821070671081543, "learning_rate": 2.9778157461947674e-05, "loss": 2.5378, "step": 1938000 }, { "epoch": 4.057884248249992, "grad_norm": 13.982878684997559, "learning_rate": 2.9772913205457385e-05, "loss": 2.5483, "step": 1938500 }, { "epoch": 4.058930903975617, "grad_norm": 13.684313774108887, "learning_rate": 2.9767668948967093e-05, "loss": 2.566, "step": 1939000 }, { "epoch": 4.059977559701243, "grad_norm": 18.886070251464844, "learning_rate": 2.9762424692476804e-05, "loss": 2.564, "step": 1939500 }, { "epoch": 4.061024215426868, "grad_norm": 16.311222076416016, "learning_rate": 2.975718043598651e-05, "loss": 2.5798, "step": 1940000 }, { "epoch": 4.0620708711524935, "grad_norm": 14.279061317443848, "learning_rate": 2.9751936179496216e-05, "loss": 2.5534, "step": 1940500 }, { "epoch": 4.063117526878119, "grad_norm": 15.093724250793457, "learning_rate": 2.9746691923005927e-05, "loss": 2.5461, "step": 1941000 }, { "epoch": 4.0641641826037445, "grad_norm": 15.972548484802246, "learning_rate": 2.974144766651563e-05, "loss": 2.5511, "step": 1941500 }, { "epoch": 4.06521083832937, "grad_norm": 13.711220741271973, "learning_rate": 2.9736203410025343e-05, "loss": 2.5828, "step": 1942000 }, { "epoch": 4.066257494054995, "grad_norm": 13.997346878051758, "learning_rate": 2.9730959153535047e-05, "loss": 2.5729, "step": 1942500 }, { "epoch": 4.067304149780621, "grad_norm": 13.447683334350586, "learning_rate": 2.9725714897044758e-05, "loss": 2.5498, "step": 1943000 }, { "epoch": 4.068350805506246, "grad_norm": 13.967225074768066, "learning_rate": 2.9720470640554466e-05, "loss": 2.5597, "step": 1943500 }, { "epoch": 4.069397461231872, "grad_norm": 13.068917274475098, "learning_rate": 2.9715226384064177e-05, "loss": 2.5496, "step": 1944000 }, { "epoch": 4.070444116957497, "grad_norm": 14.691420555114746, "learning_rate": 2.970998212757388e-05, "loss": 2.5547, "step": 1944500 }, { "epoch": 4.071490772683123, "grad_norm": 16.19727897644043, "learning_rate": 2.9704737871083592e-05, "loss": 2.571, "step": 1945000 }, { "epoch": 4.072537428408748, "grad_norm": 17.21096420288086, "learning_rate": 2.96994936145933e-05, "loss": 2.5556, "step": 1945500 }, { "epoch": 4.073584084134374, "grad_norm": 13.410577774047852, "learning_rate": 2.9694249358103004e-05, "loss": 2.5696, "step": 1946000 }, { "epoch": 4.074630739859999, "grad_norm": 16.982126235961914, "learning_rate": 2.9689005101612715e-05, "loss": 2.5477, "step": 1946500 }, { "epoch": 4.0756773955856245, "grad_norm": 13.241558074951172, "learning_rate": 2.9683760845122423e-05, "loss": 2.561, "step": 1947000 }, { "epoch": 4.07672405131125, "grad_norm": 14.424422264099121, "learning_rate": 2.9678516588632134e-05, "loss": 2.5664, "step": 1947500 }, { "epoch": 4.077770707036875, "grad_norm": 12.139601707458496, "learning_rate": 2.967327233214184e-05, "loss": 2.5503, "step": 1948000 }, { "epoch": 4.078817362762501, "grad_norm": 14.251872062683105, "learning_rate": 2.966802807565155e-05, "loss": 2.538, "step": 1948500 }, { "epoch": 4.079864018488127, "grad_norm": 15.60315990447998, "learning_rate": 2.9662783819161254e-05, "loss": 2.5585, "step": 1949000 }, { "epoch": 4.080910674213753, "grad_norm": 14.020166397094727, "learning_rate": 2.9657539562670965e-05, "loss": 2.5692, "step": 1949500 }, { "epoch": 4.081957329939378, "grad_norm": 15.668407440185547, "learning_rate": 2.9652295306180673e-05, "loss": 2.5439, "step": 1950000 }, { "epoch": 4.0830039856650036, "grad_norm": 14.291611671447754, "learning_rate": 2.9647051049690384e-05, "loss": 2.5736, "step": 1950500 }, { "epoch": 4.084050641390629, "grad_norm": 18.259960174560547, "learning_rate": 2.9641806793200088e-05, "loss": 2.5485, "step": 1951000 }, { "epoch": 4.0850972971162545, "grad_norm": 13.421432495117188, "learning_rate": 2.9636562536709796e-05, "loss": 2.5491, "step": 1951500 }, { "epoch": 4.08614395284188, "grad_norm": 14.069260597229004, "learning_rate": 2.9631318280219507e-05, "loss": 2.5746, "step": 1952000 }, { "epoch": 4.087190608567505, "grad_norm": 13.409321784973145, "learning_rate": 2.962607402372921e-05, "loss": 2.5516, "step": 1952500 }, { "epoch": 4.088237264293131, "grad_norm": 14.677885055541992, "learning_rate": 2.9620829767238922e-05, "loss": 2.5545, "step": 1953000 }, { "epoch": 4.089283920018756, "grad_norm": 13.407657623291016, "learning_rate": 2.9615585510748626e-05, "loss": 2.5698, "step": 1953500 }, { "epoch": 4.090330575744382, "grad_norm": 12.031242370605469, "learning_rate": 2.961034125425834e-05, "loss": 2.558, "step": 1954000 }, { "epoch": 4.091377231470007, "grad_norm": 14.156698226928711, "learning_rate": 2.9605096997768045e-05, "loss": 2.555, "step": 1954500 }, { "epoch": 4.092423887195633, "grad_norm": 13.547745704650879, "learning_rate": 2.9599852741277756e-05, "loss": 2.5715, "step": 1955000 }, { "epoch": 4.093470542921258, "grad_norm": 14.405147552490234, "learning_rate": 2.959460848478746e-05, "loss": 2.557, "step": 1955500 }, { "epoch": 4.094517198646884, "grad_norm": 14.335883140563965, "learning_rate": 2.958936422829717e-05, "loss": 2.5627, "step": 1956000 }, { "epoch": 4.095563854372509, "grad_norm": 16.442174911499023, "learning_rate": 2.958411997180688e-05, "loss": 2.5616, "step": 1956500 }, { "epoch": 4.0966105100981345, "grad_norm": 15.155664443969727, "learning_rate": 2.9578875715316584e-05, "loss": 2.5812, "step": 1957000 }, { "epoch": 4.09765716582376, "grad_norm": 14.751226425170898, "learning_rate": 2.9573631458826295e-05, "loss": 2.5471, "step": 1957500 }, { "epoch": 4.098703821549385, "grad_norm": 13.005013465881348, "learning_rate": 2.9568387202336002e-05, "loss": 2.5618, "step": 1958000 }, { "epoch": 4.099750477275011, "grad_norm": 14.154365539550781, "learning_rate": 2.9563142945845713e-05, "loss": 2.5725, "step": 1958500 }, { "epoch": 4.100797133000636, "grad_norm": 15.217844009399414, "learning_rate": 2.9557898689355418e-05, "loss": 2.5543, "step": 1959000 }, { "epoch": 4.101843788726262, "grad_norm": 15.485514640808105, "learning_rate": 2.955265443286513e-05, "loss": 2.5932, "step": 1959500 }, { "epoch": 4.102890444451887, "grad_norm": 16.700016021728516, "learning_rate": 2.9547410176374833e-05, "loss": 2.56, "step": 1960000 }, { "epoch": 4.103937100177513, "grad_norm": 14.081635475158691, "learning_rate": 2.9542165919884544e-05, "loss": 2.5593, "step": 1960500 }, { "epoch": 4.104983755903138, "grad_norm": 13.457268714904785, "learning_rate": 2.9536921663394252e-05, "loss": 2.5345, "step": 1961000 }, { "epoch": 4.106030411628764, "grad_norm": 14.9102144241333, "learning_rate": 2.9531677406903963e-05, "loss": 2.5382, "step": 1961500 }, { "epoch": 4.107077067354389, "grad_norm": 21.125694274902344, "learning_rate": 2.9526433150413667e-05, "loss": 2.5509, "step": 1962000 }, { "epoch": 4.1081237230800145, "grad_norm": 15.938827514648438, "learning_rate": 2.9521188893923375e-05, "loss": 2.5392, "step": 1962500 }, { "epoch": 4.10917037880564, "grad_norm": 15.501781463623047, "learning_rate": 2.9515944637433086e-05, "loss": 2.5494, "step": 1963000 }, { "epoch": 4.110217034531265, "grad_norm": 15.280054092407227, "learning_rate": 2.951070038094279e-05, "loss": 2.5453, "step": 1963500 }, { "epoch": 4.111263690256891, "grad_norm": 12.499234199523926, "learning_rate": 2.95054561244525e-05, "loss": 2.5552, "step": 1964000 }, { "epoch": 4.112310345982516, "grad_norm": 14.192831993103027, "learning_rate": 2.950021186796221e-05, "loss": 2.5691, "step": 1964500 }, { "epoch": 4.113357001708142, "grad_norm": 19.502662658691406, "learning_rate": 2.949496761147192e-05, "loss": 2.5419, "step": 1965000 }, { "epoch": 4.114403657433767, "grad_norm": 14.958807945251465, "learning_rate": 2.9489723354981624e-05, "loss": 2.5543, "step": 1965500 }, { "epoch": 4.115450313159393, "grad_norm": 14.231474876403809, "learning_rate": 2.9484479098491336e-05, "loss": 2.559, "step": 1966000 }, { "epoch": 4.116496968885018, "grad_norm": 14.066886901855469, "learning_rate": 2.947923484200104e-05, "loss": 2.5435, "step": 1966500 }, { "epoch": 4.1175436246106445, "grad_norm": 15.257647514343262, "learning_rate": 2.947399058551075e-05, "loss": 2.5629, "step": 1967000 }, { "epoch": 4.11859028033627, "grad_norm": 15.989460945129395, "learning_rate": 2.946874632902046e-05, "loss": 2.5582, "step": 1967500 }, { "epoch": 4.119636936061895, "grad_norm": 15.917204856872559, "learning_rate": 2.946350207253017e-05, "loss": 2.5494, "step": 1968000 }, { "epoch": 4.120683591787521, "grad_norm": 18.643991470336914, "learning_rate": 2.9458257816039874e-05, "loss": 2.5479, "step": 1968500 }, { "epoch": 4.121730247513146, "grad_norm": 13.895061492919922, "learning_rate": 2.9453013559549582e-05, "loss": 2.566, "step": 1969000 }, { "epoch": 4.122776903238772, "grad_norm": 15.162283897399902, "learning_rate": 2.9447769303059293e-05, "loss": 2.5498, "step": 1969500 }, { "epoch": 4.123823558964397, "grad_norm": 15.535402297973633, "learning_rate": 2.9442525046568997e-05, "loss": 2.5549, "step": 1970000 }, { "epoch": 4.124870214690023, "grad_norm": 13.476780891418457, "learning_rate": 2.9437280790078708e-05, "loss": 2.5623, "step": 1970500 }, { "epoch": 4.125916870415648, "grad_norm": 15.766885757446289, "learning_rate": 2.9432036533588412e-05, "loss": 2.5665, "step": 1971000 }, { "epoch": 4.126963526141274, "grad_norm": 16.01136589050293, "learning_rate": 2.9426792277098124e-05, "loss": 2.5731, "step": 1971500 }, { "epoch": 4.128010181866899, "grad_norm": 13.520637512207031, "learning_rate": 2.942154802060783e-05, "loss": 2.5507, "step": 1972000 }, { "epoch": 4.1290568375925245, "grad_norm": 13.645588874816895, "learning_rate": 2.9416303764117542e-05, "loss": 2.5454, "step": 1972500 }, { "epoch": 4.13010349331815, "grad_norm": 15.453487396240234, "learning_rate": 2.9411059507627247e-05, "loss": 2.5635, "step": 1973000 }, { "epoch": 4.131150149043775, "grad_norm": 13.378671646118164, "learning_rate": 2.9405815251136958e-05, "loss": 2.5402, "step": 1973500 }, { "epoch": 4.132196804769401, "grad_norm": 17.112140655517578, "learning_rate": 2.9400570994646665e-05, "loss": 2.558, "step": 1974000 }, { "epoch": 4.133243460495026, "grad_norm": 14.42751407623291, "learning_rate": 2.939532673815637e-05, "loss": 2.5616, "step": 1974500 }, { "epoch": 4.134290116220652, "grad_norm": 12.537399291992188, "learning_rate": 2.939008248166608e-05, "loss": 2.5616, "step": 1975000 }, { "epoch": 4.135336771946277, "grad_norm": 13.637046813964844, "learning_rate": 2.938483822517579e-05, "loss": 2.5755, "step": 1975500 }, { "epoch": 4.136383427671903, "grad_norm": 14.156824111938477, "learning_rate": 2.93795939686855e-05, "loss": 2.5523, "step": 1976000 }, { "epoch": 4.137430083397528, "grad_norm": 13.22031307220459, "learning_rate": 2.9374349712195204e-05, "loss": 2.5252, "step": 1976500 }, { "epoch": 4.138476739123154, "grad_norm": 13.677162170410156, "learning_rate": 2.9369105455704915e-05, "loss": 2.5549, "step": 1977000 }, { "epoch": 4.139523394848779, "grad_norm": 14.80107593536377, "learning_rate": 2.936386119921462e-05, "loss": 2.564, "step": 1977500 }, { "epoch": 4.1405700505744045, "grad_norm": 13.910484313964844, "learning_rate": 2.935861694272433e-05, "loss": 2.5567, "step": 1978000 }, { "epoch": 4.14161670630003, "grad_norm": 15.468822479248047, "learning_rate": 2.9353372686234038e-05, "loss": 2.543, "step": 1978500 }, { "epoch": 4.142663362025655, "grad_norm": 12.796123504638672, "learning_rate": 2.934812842974375e-05, "loss": 2.5608, "step": 1979000 }, { "epoch": 4.143710017751281, "grad_norm": 11.911914825439453, "learning_rate": 2.9342884173253453e-05, "loss": 2.5403, "step": 1979500 }, { "epoch": 4.144756673476906, "grad_norm": 12.696616172790527, "learning_rate": 2.933763991676316e-05, "loss": 2.5367, "step": 1980000 }, { "epoch": 4.145803329202532, "grad_norm": 13.32083511352539, "learning_rate": 2.9332395660272872e-05, "loss": 2.5668, "step": 1980500 }, { "epoch": 4.146849984928157, "grad_norm": 15.378053665161133, "learning_rate": 2.9327151403782576e-05, "loss": 2.5537, "step": 1981000 }, { "epoch": 4.147896640653783, "grad_norm": 18.444625854492188, "learning_rate": 2.9321907147292288e-05, "loss": 2.5462, "step": 1981500 }, { "epoch": 4.148943296379408, "grad_norm": 15.706521987915039, "learning_rate": 2.9316662890801995e-05, "loss": 2.5525, "step": 1982000 }, { "epoch": 4.149989952105034, "grad_norm": 14.374217987060547, "learning_rate": 2.9311418634311706e-05, "loss": 2.5434, "step": 1982500 }, { "epoch": 4.151036607830659, "grad_norm": 17.647050857543945, "learning_rate": 2.930617437782141e-05, "loss": 2.5668, "step": 1983000 }, { "epoch": 4.1520832635562845, "grad_norm": 13.83571720123291, "learning_rate": 2.9300930121331122e-05, "loss": 2.5414, "step": 1983500 }, { "epoch": 4.153129919281911, "grad_norm": 12.774242401123047, "learning_rate": 2.9295685864840826e-05, "loss": 2.5517, "step": 1984000 }, { "epoch": 4.154176575007536, "grad_norm": 14.338799476623535, "learning_rate": 2.9290441608350537e-05, "loss": 2.5447, "step": 1984500 }, { "epoch": 4.155223230733162, "grad_norm": 13.455757141113281, "learning_rate": 2.9285197351860245e-05, "loss": 2.557, "step": 1985000 }, { "epoch": 4.156269886458787, "grad_norm": 15.550559997558594, "learning_rate": 2.927995309536995e-05, "loss": 2.5561, "step": 1985500 }, { "epoch": 4.157316542184413, "grad_norm": 21.22275733947754, "learning_rate": 2.927470883887966e-05, "loss": 2.5748, "step": 1986000 }, { "epoch": 4.158363197910038, "grad_norm": 14.185630798339844, "learning_rate": 2.9269464582389368e-05, "loss": 2.5484, "step": 1986500 }, { "epoch": 4.159409853635664, "grad_norm": 14.316940307617188, "learning_rate": 2.926422032589908e-05, "loss": 2.5311, "step": 1987000 }, { "epoch": 4.160456509361289, "grad_norm": 14.891915321350098, "learning_rate": 2.9258976069408783e-05, "loss": 2.5617, "step": 1987500 }, { "epoch": 4.1615031650869145, "grad_norm": 15.20866584777832, "learning_rate": 2.9253731812918494e-05, "loss": 2.5541, "step": 1988000 }, { "epoch": 4.16254982081254, "grad_norm": 15.146550178527832, "learning_rate": 2.92484875564282e-05, "loss": 2.5532, "step": 1988500 }, { "epoch": 4.1635964765381654, "grad_norm": 13.981414794921875, "learning_rate": 2.924324329993791e-05, "loss": 2.5494, "step": 1989000 }, { "epoch": 4.164643132263791, "grad_norm": 14.491525650024414, "learning_rate": 2.9237999043447617e-05, "loss": 2.5514, "step": 1989500 }, { "epoch": 4.165689787989416, "grad_norm": 16.71908187866211, "learning_rate": 2.923275478695733e-05, "loss": 2.5328, "step": 1990000 }, { "epoch": 4.166736443715042, "grad_norm": 14.90012264251709, "learning_rate": 2.9227510530467033e-05, "loss": 2.5551, "step": 1990500 }, { "epoch": 4.167783099440667, "grad_norm": 13.24530029296875, "learning_rate": 2.922226627397674e-05, "loss": 2.5458, "step": 1991000 }, { "epoch": 4.168829755166293, "grad_norm": 14.471539497375488, "learning_rate": 2.921702201748645e-05, "loss": 2.5486, "step": 1991500 }, { "epoch": 4.169876410891918, "grad_norm": 13.991703987121582, "learning_rate": 2.9211777760996156e-05, "loss": 2.5442, "step": 1992000 }, { "epoch": 4.170923066617544, "grad_norm": 15.903958320617676, "learning_rate": 2.9206533504505867e-05, "loss": 2.564, "step": 1992500 }, { "epoch": 4.171969722343169, "grad_norm": 15.172286987304688, "learning_rate": 2.9201289248015575e-05, "loss": 2.5478, "step": 1993000 }, { "epoch": 4.1730163780687946, "grad_norm": 16.64340591430664, "learning_rate": 2.9196044991525286e-05, "loss": 2.5549, "step": 1993500 }, { "epoch": 4.17406303379442, "grad_norm": 17.250097274780273, "learning_rate": 2.919080073503499e-05, "loss": 2.5342, "step": 1994000 }, { "epoch": 4.1751096895200455, "grad_norm": 14.025978088378906, "learning_rate": 2.91855564785447e-05, "loss": 2.5556, "step": 1994500 }, { "epoch": 4.176156345245671, "grad_norm": 13.980650901794434, "learning_rate": 2.9180312222054405e-05, "loss": 2.5582, "step": 1995000 }, { "epoch": 4.177203000971296, "grad_norm": 13.129413604736328, "learning_rate": 2.9175067965564116e-05, "loss": 2.5475, "step": 1995500 }, { "epoch": 4.178249656696922, "grad_norm": 13.242536544799805, "learning_rate": 2.9169823709073824e-05, "loss": 2.5469, "step": 1996000 }, { "epoch": 4.179296312422547, "grad_norm": 14.903132438659668, "learning_rate": 2.916457945258353e-05, "loss": 2.5399, "step": 1996500 }, { "epoch": 4.180342968148173, "grad_norm": 12.431674003601074, "learning_rate": 2.915933519609324e-05, "loss": 2.5609, "step": 1997000 }, { "epoch": 4.181389623873798, "grad_norm": 14.37376594543457, "learning_rate": 2.9154090939602947e-05, "loss": 2.5607, "step": 1997500 }, { "epoch": 4.182436279599424, "grad_norm": 13.642045974731445, "learning_rate": 2.9148846683112658e-05, "loss": 2.5599, "step": 1998000 }, { "epoch": 4.183482935325049, "grad_norm": 13.973983764648438, "learning_rate": 2.9143602426622363e-05, "loss": 2.5517, "step": 1998500 }, { "epoch": 4.184529591050675, "grad_norm": 20.59362030029297, "learning_rate": 2.9138358170132074e-05, "loss": 2.5462, "step": 1999000 }, { "epoch": 4.1855762467763, "grad_norm": 16.006731033325195, "learning_rate": 2.9133113913641778e-05, "loss": 2.5683, "step": 1999500 }, { "epoch": 4.1866229025019255, "grad_norm": 16.426897048950195, "learning_rate": 2.9127869657151492e-05, "loss": 2.5484, "step": 2000000 }, { "epoch": 4.187669558227551, "grad_norm": 14.30644702911377, "learning_rate": 2.9122625400661197e-05, "loss": 2.563, "step": 2000500 }, { "epoch": 4.188716213953176, "grad_norm": 13.66461181640625, "learning_rate": 2.9117381144170908e-05, "loss": 2.5457, "step": 2001000 }, { "epoch": 4.189762869678802, "grad_norm": 15.955156326293945, "learning_rate": 2.9112136887680612e-05, "loss": 2.5542, "step": 2001500 }, { "epoch": 4.190809525404428, "grad_norm": 18.605548858642578, "learning_rate": 2.9106892631190323e-05, "loss": 2.5331, "step": 2002000 }, { "epoch": 4.191856181130054, "grad_norm": 16.58286476135254, "learning_rate": 2.910164837470003e-05, "loss": 2.5439, "step": 2002500 }, { "epoch": 4.192902836855679, "grad_norm": 13.441219329833984, "learning_rate": 2.9096404118209735e-05, "loss": 2.5466, "step": 2003000 }, { "epoch": 4.193949492581305, "grad_norm": 14.738204956054688, "learning_rate": 2.9091159861719446e-05, "loss": 2.5478, "step": 2003500 }, { "epoch": 4.19499614830693, "grad_norm": 14.361391067504883, "learning_rate": 2.9085915605229154e-05, "loss": 2.5555, "step": 2004000 }, { "epoch": 4.1960428040325555, "grad_norm": 14.616143226623535, "learning_rate": 2.9080671348738865e-05, "loss": 2.5459, "step": 2004500 }, { "epoch": 4.197089459758181, "grad_norm": 17.36290740966797, "learning_rate": 2.907542709224857e-05, "loss": 2.5477, "step": 2005000 }, { "epoch": 4.198136115483806, "grad_norm": 18.29613494873047, "learning_rate": 2.907018283575828e-05, "loss": 2.5515, "step": 2005500 }, { "epoch": 4.199182771209432, "grad_norm": 13.907181739807129, "learning_rate": 2.9064938579267985e-05, "loss": 2.5612, "step": 2006000 }, { "epoch": 4.200229426935057, "grad_norm": 12.162410736083984, "learning_rate": 2.9059694322777696e-05, "loss": 2.5557, "step": 2006500 }, { "epoch": 4.201276082660683, "grad_norm": 14.061047554016113, "learning_rate": 2.9054450066287404e-05, "loss": 2.5539, "step": 2007000 }, { "epoch": 4.202322738386308, "grad_norm": 15.327051162719727, "learning_rate": 2.9049205809797115e-05, "loss": 2.551, "step": 2007500 }, { "epoch": 4.203369394111934, "grad_norm": 15.046870231628418, "learning_rate": 2.904396155330682e-05, "loss": 2.5429, "step": 2008000 }, { "epoch": 4.204416049837559, "grad_norm": 12.818771362304688, "learning_rate": 2.9038717296816527e-05, "loss": 2.5487, "step": 2008500 }, { "epoch": 4.205462705563185, "grad_norm": 15.062434196472168, "learning_rate": 2.9033473040326238e-05, "loss": 2.5427, "step": 2009000 }, { "epoch": 4.20650936128881, "grad_norm": 13.699359893798828, "learning_rate": 2.9028228783835942e-05, "loss": 2.541, "step": 2009500 }, { "epoch": 4.2075560170144355, "grad_norm": 14.946218490600586, "learning_rate": 2.9022984527345653e-05, "loss": 2.5632, "step": 2010000 }, { "epoch": 4.208602672740061, "grad_norm": 13.122336387634277, "learning_rate": 2.901774027085536e-05, "loss": 2.5216, "step": 2010500 }, { "epoch": 4.209649328465686, "grad_norm": 27.68925666809082, "learning_rate": 2.9012496014365072e-05, "loss": 2.5557, "step": 2011000 }, { "epoch": 4.210695984191312, "grad_norm": 13.202563285827637, "learning_rate": 2.9007251757874776e-05, "loss": 2.5684, "step": 2011500 }, { "epoch": 4.211742639916937, "grad_norm": 15.182026863098145, "learning_rate": 2.9002007501384487e-05, "loss": 2.5554, "step": 2012000 }, { "epoch": 4.212789295642563, "grad_norm": 13.605091094970703, "learning_rate": 2.899676324489419e-05, "loss": 2.5544, "step": 2012500 }, { "epoch": 4.213835951368188, "grad_norm": 13.982193946838379, "learning_rate": 2.8991518988403903e-05, "loss": 2.5535, "step": 2013000 }, { "epoch": 4.214882607093814, "grad_norm": 12.959322929382324, "learning_rate": 2.898627473191361e-05, "loss": 2.5512, "step": 2013500 }, { "epoch": 4.215929262819439, "grad_norm": 11.843743324279785, "learning_rate": 2.8981030475423315e-05, "loss": 2.5578, "step": 2014000 }, { "epoch": 4.216975918545065, "grad_norm": 13.84371280670166, "learning_rate": 2.8975786218933026e-05, "loss": 2.5323, "step": 2014500 }, { "epoch": 4.21802257427069, "grad_norm": 15.423362731933594, "learning_rate": 2.8970541962442733e-05, "loss": 2.5485, "step": 2015000 }, { "epoch": 4.2190692299963155, "grad_norm": 12.79878044128418, "learning_rate": 2.8965297705952444e-05, "loss": 2.5564, "step": 2015500 }, { "epoch": 4.220115885721941, "grad_norm": 13.429879188537598, "learning_rate": 2.896005344946215e-05, "loss": 2.5749, "step": 2016000 }, { "epoch": 4.221162541447566, "grad_norm": 13.93543529510498, "learning_rate": 2.895480919297186e-05, "loss": 2.5363, "step": 2016500 }, { "epoch": 4.222209197173192, "grad_norm": 16.81399154663086, "learning_rate": 2.8949564936481564e-05, "loss": 2.5467, "step": 2017000 }, { "epoch": 4.223255852898817, "grad_norm": 14.823683738708496, "learning_rate": 2.894432067999128e-05, "loss": 2.5426, "step": 2017500 }, { "epoch": 4.224302508624443, "grad_norm": 14.425249099731445, "learning_rate": 2.8939076423500983e-05, "loss": 2.5276, "step": 2018000 }, { "epoch": 4.225349164350068, "grad_norm": 14.990452766418457, "learning_rate": 2.8933832167010694e-05, "loss": 2.5555, "step": 2018500 }, { "epoch": 4.226395820075695, "grad_norm": 15.230101585388184, "learning_rate": 2.8928587910520398e-05, "loss": 2.5501, "step": 2019000 }, { "epoch": 4.22744247580132, "grad_norm": 13.756220817565918, "learning_rate": 2.8923343654030106e-05, "loss": 2.565, "step": 2019500 }, { "epoch": 4.2284891315269455, "grad_norm": 12.961316108703613, "learning_rate": 2.8918099397539817e-05, "loss": 2.5523, "step": 2020000 }, { "epoch": 4.229535787252571, "grad_norm": 13.793363571166992, "learning_rate": 2.891285514104952e-05, "loss": 2.5287, "step": 2020500 }, { "epoch": 4.230582442978196, "grad_norm": 13.087318420410156, "learning_rate": 2.8907610884559232e-05, "loss": 2.5556, "step": 2021000 }, { "epoch": 4.231629098703822, "grad_norm": 13.494669914245605, "learning_rate": 2.890236662806894e-05, "loss": 2.5522, "step": 2021500 }, { "epoch": 4.232675754429447, "grad_norm": 14.00183391571045, "learning_rate": 2.889712237157865e-05, "loss": 2.5442, "step": 2022000 }, { "epoch": 4.233722410155073, "grad_norm": 13.381916046142578, "learning_rate": 2.8891878115088356e-05, "loss": 2.5397, "step": 2022500 }, { "epoch": 4.234769065880698, "grad_norm": 13.43486213684082, "learning_rate": 2.8886633858598067e-05, "loss": 2.5487, "step": 2023000 }, { "epoch": 4.235815721606324, "grad_norm": 14.329910278320312, "learning_rate": 2.888138960210777e-05, "loss": 2.5194, "step": 2023500 }, { "epoch": 4.236862377331949, "grad_norm": 16.08717155456543, "learning_rate": 2.8876145345617482e-05, "loss": 2.5385, "step": 2024000 }, { "epoch": 4.237909033057575, "grad_norm": 14.47070598602295, "learning_rate": 2.887090108912719e-05, "loss": 2.5541, "step": 2024500 }, { "epoch": 4.2389556887832, "grad_norm": 14.210230827331543, "learning_rate": 2.8865656832636894e-05, "loss": 2.5463, "step": 2025000 }, { "epoch": 4.2400023445088255, "grad_norm": 15.17423152923584, "learning_rate": 2.8860412576146605e-05, "loss": 2.5521, "step": 2025500 }, { "epoch": 4.241049000234451, "grad_norm": 14.841958999633789, "learning_rate": 2.8855168319656313e-05, "loss": 2.5271, "step": 2026000 }, { "epoch": 4.242095655960076, "grad_norm": 14.6624174118042, "learning_rate": 2.8849924063166024e-05, "loss": 2.5402, "step": 2026500 }, { "epoch": 4.243142311685702, "grad_norm": 15.683831214904785, "learning_rate": 2.8844679806675728e-05, "loss": 2.5347, "step": 2027000 }, { "epoch": 4.244188967411327, "grad_norm": 15.740399360656738, "learning_rate": 2.883943555018544e-05, "loss": 2.5505, "step": 2027500 }, { "epoch": 4.245235623136953, "grad_norm": 16.134288787841797, "learning_rate": 2.8834191293695147e-05, "loss": 2.5385, "step": 2028000 }, { "epoch": 4.246282278862578, "grad_norm": 14.625067710876465, "learning_rate": 2.8828947037204858e-05, "loss": 2.5671, "step": 2028500 }, { "epoch": 4.247328934588204, "grad_norm": 13.339335441589355, "learning_rate": 2.8823702780714562e-05, "loss": 2.5562, "step": 2029000 }, { "epoch": 4.248375590313829, "grad_norm": 13.822732925415039, "learning_rate": 2.8818458524224273e-05, "loss": 2.5579, "step": 2029500 }, { "epoch": 4.249422246039455, "grad_norm": 14.871743202209473, "learning_rate": 2.8813214267733978e-05, "loss": 2.5363, "step": 2030000 }, { "epoch": 4.25046890176508, "grad_norm": 12.203572273254395, "learning_rate": 2.8807970011243685e-05, "loss": 2.5516, "step": 2030500 }, { "epoch": 4.2515155574907055, "grad_norm": 18.46527099609375, "learning_rate": 2.8802725754753396e-05, "loss": 2.5441, "step": 2031000 }, { "epoch": 4.252562213216331, "grad_norm": 16.63855743408203, "learning_rate": 2.87974814982631e-05, "loss": 2.5354, "step": 2031500 }, { "epoch": 4.2536088689419564, "grad_norm": 14.30959415435791, "learning_rate": 2.8792237241772812e-05, "loss": 2.5445, "step": 2032000 }, { "epoch": 4.254655524667582, "grad_norm": 18.90511703491211, "learning_rate": 2.878699298528252e-05, "loss": 2.5544, "step": 2032500 }, { "epoch": 4.255702180393207, "grad_norm": 15.362434387207031, "learning_rate": 2.878174872879223e-05, "loss": 2.5522, "step": 2033000 }, { "epoch": 4.256748836118833, "grad_norm": 14.544995307922363, "learning_rate": 2.8776504472301935e-05, "loss": 2.5346, "step": 2033500 }, { "epoch": 4.257795491844458, "grad_norm": 13.464016914367676, "learning_rate": 2.8771260215811646e-05, "loss": 2.5329, "step": 2034000 }, { "epoch": 4.258842147570084, "grad_norm": 13.239256858825684, "learning_rate": 2.876601595932135e-05, "loss": 2.561, "step": 2034500 }, { "epoch": 4.259888803295709, "grad_norm": 14.911470413208008, "learning_rate": 2.8760771702831065e-05, "loss": 2.5554, "step": 2035000 }, { "epoch": 4.260935459021335, "grad_norm": 17.564111709594727, "learning_rate": 2.875552744634077e-05, "loss": 2.5371, "step": 2035500 }, { "epoch": 4.26198211474696, "grad_norm": 12.862536430358887, "learning_rate": 2.875028318985048e-05, "loss": 2.5453, "step": 2036000 }, { "epoch": 4.2630287704725855, "grad_norm": 14.937185287475586, "learning_rate": 2.8745038933360184e-05, "loss": 2.5364, "step": 2036500 }, { "epoch": 4.264075426198211, "grad_norm": 14.076891899108887, "learning_rate": 2.8739794676869892e-05, "loss": 2.5375, "step": 2037000 }, { "epoch": 4.265122081923837, "grad_norm": 13.558475494384766, "learning_rate": 2.8734550420379603e-05, "loss": 2.5324, "step": 2037500 }, { "epoch": 4.266168737649463, "grad_norm": 13.09357738494873, "learning_rate": 2.8729306163889307e-05, "loss": 2.5309, "step": 2038000 }, { "epoch": 4.267215393375088, "grad_norm": 13.60975170135498, "learning_rate": 2.872406190739902e-05, "loss": 2.5215, "step": 2038500 }, { "epoch": 4.268262049100714, "grad_norm": 14.032530784606934, "learning_rate": 2.8718817650908726e-05, "loss": 2.5345, "step": 2039000 }, { "epoch": 4.269308704826339, "grad_norm": 13.35231876373291, "learning_rate": 2.8713573394418437e-05, "loss": 2.5522, "step": 2039500 }, { "epoch": 4.270355360551965, "grad_norm": 14.854559898376465, "learning_rate": 2.870832913792814e-05, "loss": 2.5145, "step": 2040000 }, { "epoch": 4.27140201627759, "grad_norm": 13.325324058532715, "learning_rate": 2.8703084881437853e-05, "loss": 2.5532, "step": 2040500 }, { "epoch": 4.2724486720032155, "grad_norm": 14.847299575805664, "learning_rate": 2.8697840624947557e-05, "loss": 2.5416, "step": 2041000 }, { "epoch": 4.273495327728841, "grad_norm": 12.785348892211914, "learning_rate": 2.8692596368457268e-05, "loss": 2.5504, "step": 2041500 }, { "epoch": 4.2745419834544665, "grad_norm": 15.31374740600586, "learning_rate": 2.8687352111966976e-05, "loss": 2.5321, "step": 2042000 }, { "epoch": 4.275588639180092, "grad_norm": 14.819722175598145, "learning_rate": 2.868210785547668e-05, "loss": 2.5443, "step": 2042500 }, { "epoch": 4.276635294905717, "grad_norm": 16.580036163330078, "learning_rate": 2.867686359898639e-05, "loss": 2.5585, "step": 2043000 }, { "epoch": 4.277681950631343, "grad_norm": 15.138056755065918, "learning_rate": 2.86716193424961e-05, "loss": 2.5469, "step": 2043500 }, { "epoch": 4.278728606356968, "grad_norm": 25.770769119262695, "learning_rate": 2.866637508600581e-05, "loss": 2.5477, "step": 2044000 }, { "epoch": 4.279775262082594, "grad_norm": 14.374612808227539, "learning_rate": 2.8661130829515514e-05, "loss": 2.5555, "step": 2044500 }, { "epoch": 4.280821917808219, "grad_norm": 17.252859115600586, "learning_rate": 2.8655886573025225e-05, "loss": 2.5293, "step": 2045000 }, { "epoch": 4.281868573533845, "grad_norm": 13.941069602966309, "learning_rate": 2.8650642316534933e-05, "loss": 2.5552, "step": 2045500 }, { "epoch": 4.28291522925947, "grad_norm": 12.386725425720215, "learning_rate": 2.8645398060044644e-05, "loss": 2.5283, "step": 2046000 }, { "epoch": 4.283961884985096, "grad_norm": 15.354578018188477, "learning_rate": 2.864015380355435e-05, "loss": 2.5408, "step": 2046500 }, { "epoch": 4.285008540710721, "grad_norm": 14.50407886505127, "learning_rate": 2.863490954706406e-05, "loss": 2.5318, "step": 2047000 }, { "epoch": 4.2860551964363465, "grad_norm": 17.439966201782227, "learning_rate": 2.8629665290573764e-05, "loss": 2.5437, "step": 2047500 }, { "epoch": 4.287101852161972, "grad_norm": 15.14749813079834, "learning_rate": 2.862442103408347e-05, "loss": 2.538, "step": 2048000 }, { "epoch": 4.288148507887597, "grad_norm": 15.274859428405762, "learning_rate": 2.8619176777593183e-05, "loss": 2.5479, "step": 2048500 }, { "epoch": 4.289195163613223, "grad_norm": 15.35230541229248, "learning_rate": 2.8613932521102887e-05, "loss": 2.5486, "step": 2049000 }, { "epoch": 4.290241819338848, "grad_norm": 16.244054794311523, "learning_rate": 2.8608688264612598e-05, "loss": 2.5341, "step": 2049500 }, { "epoch": 4.291288475064474, "grad_norm": 15.007954597473145, "learning_rate": 2.8603444008122306e-05, "loss": 2.53, "step": 2050000 }, { "epoch": 4.292335130790099, "grad_norm": 15.140063285827637, "learning_rate": 2.8598199751632017e-05, "loss": 2.5327, "step": 2050500 }, { "epoch": 4.293381786515725, "grad_norm": 13.231292724609375, "learning_rate": 2.859295549514172e-05, "loss": 2.5507, "step": 2051000 }, { "epoch": 4.29442844224135, "grad_norm": 15.759570121765137, "learning_rate": 2.8587711238651432e-05, "loss": 2.5435, "step": 2051500 }, { "epoch": 4.295475097966976, "grad_norm": 17.001253128051758, "learning_rate": 2.8582466982161136e-05, "loss": 2.5444, "step": 2052000 }, { "epoch": 4.296521753692601, "grad_norm": 14.057848930358887, "learning_rate": 2.8577222725670847e-05, "loss": 2.5579, "step": 2052500 }, { "epoch": 4.2975684094182265, "grad_norm": 17.64349937438965, "learning_rate": 2.8571978469180555e-05, "loss": 2.541, "step": 2053000 }, { "epoch": 4.298615065143853, "grad_norm": 14.674941062927246, "learning_rate": 2.856673421269026e-05, "loss": 2.5417, "step": 2053500 }, { "epoch": 4.299661720869478, "grad_norm": 12.615180015563965, "learning_rate": 2.856148995619997e-05, "loss": 2.5509, "step": 2054000 }, { "epoch": 4.300708376595104, "grad_norm": 15.894262313842773, "learning_rate": 2.8556245699709678e-05, "loss": 2.5513, "step": 2054500 }, { "epoch": 4.301755032320729, "grad_norm": 13.964629173278809, "learning_rate": 2.855100144321939e-05, "loss": 2.555, "step": 2055000 }, { "epoch": 4.302801688046355, "grad_norm": 16.451560974121094, "learning_rate": 2.8545757186729094e-05, "loss": 2.5191, "step": 2055500 }, { "epoch": 4.30384834377198, "grad_norm": 15.576972961425781, "learning_rate": 2.8540512930238805e-05, "loss": 2.5332, "step": 2056000 }, { "epoch": 4.304894999497606, "grad_norm": 17.028518676757812, "learning_rate": 2.8535268673748512e-05, "loss": 2.5504, "step": 2056500 }, { "epoch": 4.305941655223231, "grad_norm": 14.133017539978027, "learning_rate": 2.8530024417258223e-05, "loss": 2.5448, "step": 2057000 }, { "epoch": 4.3069883109488565, "grad_norm": 15.877196311950684, "learning_rate": 2.8524780160767928e-05, "loss": 2.5188, "step": 2057500 }, { "epoch": 4.308034966674482, "grad_norm": 11.831775665283203, "learning_rate": 2.851953590427764e-05, "loss": 2.5519, "step": 2058000 }, { "epoch": 4.309081622400107, "grad_norm": 14.442946434020996, "learning_rate": 2.8514291647787343e-05, "loss": 2.5264, "step": 2058500 }, { "epoch": 4.310128278125733, "grad_norm": 13.7389497756958, "learning_rate": 2.850904739129705e-05, "loss": 2.5403, "step": 2059000 }, { "epoch": 4.311174933851358, "grad_norm": 14.653633117675781, "learning_rate": 2.8503803134806762e-05, "loss": 2.5458, "step": 2059500 }, { "epoch": 4.312221589576984, "grad_norm": 13.687685012817383, "learning_rate": 2.8498558878316466e-05, "loss": 2.5373, "step": 2060000 }, { "epoch": 4.313268245302609, "grad_norm": 13.196674346923828, "learning_rate": 2.8493314621826177e-05, "loss": 2.5256, "step": 2060500 }, { "epoch": 4.314314901028235, "grad_norm": 14.132308959960938, "learning_rate": 2.8488070365335885e-05, "loss": 2.5119, "step": 2061000 }, { "epoch": 4.31536155675386, "grad_norm": 15.080708503723145, "learning_rate": 2.8482826108845596e-05, "loss": 2.5291, "step": 2061500 }, { "epoch": 4.316408212479486, "grad_norm": 15.342007637023926, "learning_rate": 2.84775818523553e-05, "loss": 2.5511, "step": 2062000 }, { "epoch": 4.317454868205111, "grad_norm": 14.000569343566895, "learning_rate": 2.847233759586501e-05, "loss": 2.5284, "step": 2062500 }, { "epoch": 4.3185015239307365, "grad_norm": 14.906901359558105, "learning_rate": 2.846709333937472e-05, "loss": 2.5496, "step": 2063000 }, { "epoch": 4.319548179656362, "grad_norm": 13.030643463134766, "learning_rate": 2.846184908288443e-05, "loss": 2.5405, "step": 2063500 }, { "epoch": 4.320594835381987, "grad_norm": 15.335869789123535, "learning_rate": 2.8456604826394135e-05, "loss": 2.5512, "step": 2064000 }, { "epoch": 4.321641491107613, "grad_norm": 18.145004272460938, "learning_rate": 2.845136056990384e-05, "loss": 2.5373, "step": 2064500 }, { "epoch": 4.322688146833238, "grad_norm": 14.296704292297363, "learning_rate": 2.844611631341355e-05, "loss": 2.5553, "step": 2065000 }, { "epoch": 4.323734802558864, "grad_norm": 14.79932975769043, "learning_rate": 2.8440872056923258e-05, "loss": 2.5136, "step": 2065500 }, { "epoch": 4.324781458284489, "grad_norm": 16.091903686523438, "learning_rate": 2.843562780043297e-05, "loss": 2.5398, "step": 2066000 }, { "epoch": 4.325828114010115, "grad_norm": 14.54327392578125, "learning_rate": 2.8430383543942673e-05, "loss": 2.5314, "step": 2066500 }, { "epoch": 4.32687476973574, "grad_norm": 12.204386711120605, "learning_rate": 2.8425139287452384e-05, "loss": 2.5401, "step": 2067000 }, { "epoch": 4.327921425461366, "grad_norm": 13.614652633666992, "learning_rate": 2.8419895030962092e-05, "loss": 2.5389, "step": 2067500 }, { "epoch": 4.328968081186991, "grad_norm": 16.22732925415039, "learning_rate": 2.8414650774471803e-05, "loss": 2.5388, "step": 2068000 }, { "epoch": 4.3300147369126165, "grad_norm": 14.745709419250488, "learning_rate": 2.8409406517981507e-05, "loss": 2.5508, "step": 2068500 }, { "epoch": 4.331061392638242, "grad_norm": 16.19746208190918, "learning_rate": 2.8404162261491218e-05, "loss": 2.5465, "step": 2069000 }, { "epoch": 4.332108048363867, "grad_norm": 15.133536338806152, "learning_rate": 2.8398918005000923e-05, "loss": 2.5475, "step": 2069500 }, { "epoch": 4.333154704089493, "grad_norm": 15.242587089538574, "learning_rate": 2.8393673748510634e-05, "loss": 2.5326, "step": 2070000 }, { "epoch": 4.334201359815118, "grad_norm": 14.703747749328613, "learning_rate": 2.838842949202034e-05, "loss": 2.5213, "step": 2070500 }, { "epoch": 4.335248015540744, "grad_norm": 17.25076675415039, "learning_rate": 2.8383185235530046e-05, "loss": 2.5569, "step": 2071000 }, { "epoch": 4.336294671266369, "grad_norm": 14.031189918518066, "learning_rate": 2.8377940979039757e-05, "loss": 2.556, "step": 2071500 }, { "epoch": 4.337341326991995, "grad_norm": 13.104677200317383, "learning_rate": 2.8372696722549464e-05, "loss": 2.5401, "step": 2072000 }, { "epoch": 4.338387982717621, "grad_norm": 15.71812629699707, "learning_rate": 2.8367452466059175e-05, "loss": 2.5423, "step": 2072500 }, { "epoch": 4.3394346384432465, "grad_norm": 13.375164985656738, "learning_rate": 2.836220820956888e-05, "loss": 2.5261, "step": 2073000 }, { "epoch": 4.340481294168872, "grad_norm": 12.895137786865234, "learning_rate": 2.835696395307859e-05, "loss": 2.5198, "step": 2073500 }, { "epoch": 4.341527949894497, "grad_norm": 16.058128356933594, "learning_rate": 2.83517196965883e-05, "loss": 2.5338, "step": 2074000 }, { "epoch": 4.342574605620123, "grad_norm": 14.143261909484863, "learning_rate": 2.834647544009801e-05, "loss": 2.5322, "step": 2074500 }, { "epoch": 4.343621261345748, "grad_norm": 14.867342948913574, "learning_rate": 2.8341231183607714e-05, "loss": 2.5595, "step": 2075000 }, { "epoch": 4.344667917071374, "grad_norm": 16.025510787963867, "learning_rate": 2.8335986927117425e-05, "loss": 2.5345, "step": 2075500 }, { "epoch": 4.345714572796999, "grad_norm": 14.209750175476074, "learning_rate": 2.833074267062713e-05, "loss": 2.5347, "step": 2076000 }, { "epoch": 4.346761228522625, "grad_norm": 13.863821029663086, "learning_rate": 2.8325498414136837e-05, "loss": 2.5263, "step": 2076500 }, { "epoch": 4.34780788424825, "grad_norm": 15.423812866210938, "learning_rate": 2.8320254157646548e-05, "loss": 2.5332, "step": 2077000 }, { "epoch": 4.348854539973876, "grad_norm": 16.47415542602539, "learning_rate": 2.8315009901156252e-05, "loss": 2.5357, "step": 2077500 }, { "epoch": 4.349901195699501, "grad_norm": 13.225048065185547, "learning_rate": 2.8309765644665963e-05, "loss": 2.5336, "step": 2078000 }, { "epoch": 4.3509478514251265, "grad_norm": 17.11936378479004, "learning_rate": 2.830452138817567e-05, "loss": 2.5463, "step": 2078500 }, { "epoch": 4.351994507150752, "grad_norm": 16.606233596801758, "learning_rate": 2.8299277131685382e-05, "loss": 2.5541, "step": 2079000 }, { "epoch": 4.353041162876377, "grad_norm": 16.09097671508789, "learning_rate": 2.8294032875195087e-05, "loss": 2.5564, "step": 2079500 }, { "epoch": 4.354087818602003, "grad_norm": 14.737933158874512, "learning_rate": 2.8288788618704798e-05, "loss": 2.5297, "step": 2080000 }, { "epoch": 4.355134474327628, "grad_norm": 14.252581596374512, "learning_rate": 2.8283544362214502e-05, "loss": 2.5466, "step": 2080500 }, { "epoch": 4.356181130053254, "grad_norm": 13.97681999206543, "learning_rate": 2.8278300105724216e-05, "loss": 2.5394, "step": 2081000 }, { "epoch": 4.357227785778879, "grad_norm": 13.583622932434082, "learning_rate": 2.827305584923392e-05, "loss": 2.5412, "step": 2081500 }, { "epoch": 4.358274441504505, "grad_norm": 13.420355796813965, "learning_rate": 2.8267811592743625e-05, "loss": 2.5313, "step": 2082000 }, { "epoch": 4.35932109723013, "grad_norm": 14.82848072052002, "learning_rate": 2.8262567336253336e-05, "loss": 2.5329, "step": 2082500 }, { "epoch": 4.360367752955756, "grad_norm": 12.978464126586914, "learning_rate": 2.8257323079763044e-05, "loss": 2.555, "step": 2083000 }, { "epoch": 4.361414408681381, "grad_norm": 15.013078689575195, "learning_rate": 2.8252078823272755e-05, "loss": 2.5274, "step": 2083500 }, { "epoch": 4.3624610644070065, "grad_norm": 14.845885276794434, "learning_rate": 2.824683456678246e-05, "loss": 2.5359, "step": 2084000 }, { "epoch": 4.363507720132632, "grad_norm": 15.712628364562988, "learning_rate": 2.824159031029217e-05, "loss": 2.5113, "step": 2084500 }, { "epoch": 4.3645543758582575, "grad_norm": 14.696927070617676, "learning_rate": 2.8236346053801878e-05, "loss": 2.5248, "step": 2085000 }, { "epoch": 4.365601031583883, "grad_norm": 14.127656936645508, "learning_rate": 2.823110179731159e-05, "loss": 2.5335, "step": 2085500 }, { "epoch": 4.366647687309508, "grad_norm": 15.490791320800781, "learning_rate": 2.8225857540821293e-05, "loss": 2.5396, "step": 2086000 }, { "epoch": 4.367694343035134, "grad_norm": 15.437350273132324, "learning_rate": 2.8220613284331004e-05, "loss": 2.5334, "step": 2086500 }, { "epoch": 4.368740998760759, "grad_norm": 14.15667724609375, "learning_rate": 2.821536902784071e-05, "loss": 2.5524, "step": 2087000 }, { "epoch": 4.369787654486385, "grad_norm": 16.40025520324707, "learning_rate": 2.8210124771350416e-05, "loss": 2.5506, "step": 2087500 }, { "epoch": 4.37083431021201, "grad_norm": 13.44857406616211, "learning_rate": 2.8204880514860127e-05, "loss": 2.5427, "step": 2088000 }, { "epoch": 4.3718809659376365, "grad_norm": 14.278508186340332, "learning_rate": 2.8199636258369832e-05, "loss": 2.5327, "step": 2088500 }, { "epoch": 4.372927621663262, "grad_norm": 15.897112846374512, "learning_rate": 2.8194392001879543e-05, "loss": 2.5316, "step": 2089000 }, { "epoch": 4.373974277388887, "grad_norm": 12.746085166931152, "learning_rate": 2.818914774538925e-05, "loss": 2.5382, "step": 2089500 }, { "epoch": 4.375020933114513, "grad_norm": 15.513036727905273, "learning_rate": 2.818390348889896e-05, "loss": 2.538, "step": 2090000 }, { "epoch": 4.376067588840138, "grad_norm": 14.293712615966797, "learning_rate": 2.8178659232408666e-05, "loss": 2.5499, "step": 2090500 }, { "epoch": 4.377114244565764, "grad_norm": 14.343297958374023, "learning_rate": 2.8173414975918377e-05, "loss": 2.5409, "step": 2091000 }, { "epoch": 4.378160900291389, "grad_norm": 16.65743637084961, "learning_rate": 2.8168170719428085e-05, "loss": 2.5413, "step": 2091500 }, { "epoch": 4.379207556017015, "grad_norm": 13.550944328308105, "learning_rate": 2.8162926462937796e-05, "loss": 2.5365, "step": 2092000 }, { "epoch": 4.38025421174264, "grad_norm": 15.227286338806152, "learning_rate": 2.81576822064475e-05, "loss": 2.5241, "step": 2092500 }, { "epoch": 4.381300867468266, "grad_norm": 16.679019927978516, "learning_rate": 2.8152437949957204e-05, "loss": 2.5479, "step": 2093000 }, { "epoch": 4.382347523193891, "grad_norm": 13.565979957580566, "learning_rate": 2.8147193693466915e-05, "loss": 2.5433, "step": 2093500 }, { "epoch": 4.3833941789195165, "grad_norm": 13.287744522094727, "learning_rate": 2.8141949436976623e-05, "loss": 2.5288, "step": 2094000 }, { "epoch": 4.384440834645142, "grad_norm": 13.816642761230469, "learning_rate": 2.8136705180486334e-05, "loss": 2.5359, "step": 2094500 }, { "epoch": 4.3854874903707675, "grad_norm": 12.921409606933594, "learning_rate": 2.813146092399604e-05, "loss": 2.5356, "step": 2095000 }, { "epoch": 4.386534146096393, "grad_norm": 15.111021995544434, "learning_rate": 2.812621666750575e-05, "loss": 2.5275, "step": 2095500 }, { "epoch": 4.387580801822018, "grad_norm": 14.402771949768066, "learning_rate": 2.8120972411015457e-05, "loss": 2.5641, "step": 2096000 }, { "epoch": 4.388627457547644, "grad_norm": 15.826183319091797, "learning_rate": 2.811572815452517e-05, "loss": 2.5078, "step": 2096500 }, { "epoch": 4.389674113273269, "grad_norm": 16.5266170501709, "learning_rate": 2.8110483898034873e-05, "loss": 2.5289, "step": 2097000 }, { "epoch": 4.390720768998895, "grad_norm": 14.006922721862793, "learning_rate": 2.8105239641544584e-05, "loss": 2.5132, "step": 2097500 }, { "epoch": 4.39176742472452, "grad_norm": 13.240680694580078, "learning_rate": 2.8099995385054288e-05, "loss": 2.5477, "step": 2098000 }, { "epoch": 4.392814080450146, "grad_norm": 14.196012496948242, "learning_rate": 2.8094751128564002e-05, "loss": 2.56, "step": 2098500 }, { "epoch": 4.393860736175771, "grad_norm": 13.763630867004395, "learning_rate": 2.8089506872073707e-05, "loss": 2.537, "step": 2099000 }, { "epoch": 4.394907391901397, "grad_norm": 14.374702453613281, "learning_rate": 2.808426261558341e-05, "loss": 2.5285, "step": 2099500 }, { "epoch": 4.395954047627022, "grad_norm": 18.215112686157227, "learning_rate": 2.8079018359093122e-05, "loss": 2.5444, "step": 2100000 }, { "epoch": 4.3970007033526475, "grad_norm": 14.89549732208252, "learning_rate": 2.807377410260283e-05, "loss": 2.543, "step": 2100500 }, { "epoch": 4.398047359078273, "grad_norm": 14.366068840026855, "learning_rate": 2.806852984611254e-05, "loss": 2.5452, "step": 2101000 }, { "epoch": 4.399094014803898, "grad_norm": 16.85660743713379, "learning_rate": 2.8063285589622245e-05, "loss": 2.5396, "step": 2101500 }, { "epoch": 4.400140670529524, "grad_norm": 13.897644996643066, "learning_rate": 2.8058041333131956e-05, "loss": 2.5293, "step": 2102000 }, { "epoch": 4.401187326255149, "grad_norm": 13.920140266418457, "learning_rate": 2.8052797076641664e-05, "loss": 2.5357, "step": 2102500 }, { "epoch": 4.402233981980775, "grad_norm": 14.788713455200195, "learning_rate": 2.8047552820151375e-05, "loss": 2.5211, "step": 2103000 }, { "epoch": 4.4032806377064, "grad_norm": 15.30894660949707, "learning_rate": 2.804230856366108e-05, "loss": 2.5409, "step": 2103500 }, { "epoch": 4.404327293432026, "grad_norm": 14.27741527557373, "learning_rate": 2.803706430717079e-05, "loss": 2.5403, "step": 2104000 }, { "epoch": 4.405373949157651, "grad_norm": 14.594383239746094, "learning_rate": 2.8031820050680495e-05, "loss": 2.5261, "step": 2104500 }, { "epoch": 4.406420604883277, "grad_norm": 14.351943969726562, "learning_rate": 2.8026575794190202e-05, "loss": 2.5197, "step": 2105000 }, { "epoch": 4.407467260608902, "grad_norm": 14.500967025756836, "learning_rate": 2.8021331537699914e-05, "loss": 2.5148, "step": 2105500 }, { "epoch": 4.4085139163345275, "grad_norm": 15.114404678344727, "learning_rate": 2.8016087281209618e-05, "loss": 2.5257, "step": 2106000 }, { "epoch": 4.409560572060153, "grad_norm": 15.197193145751953, "learning_rate": 2.801084302471933e-05, "loss": 2.5221, "step": 2106500 }, { "epoch": 4.410607227785779, "grad_norm": 14.350997924804688, "learning_rate": 2.8005598768229037e-05, "loss": 2.5315, "step": 2107000 }, { "epoch": 4.411653883511405, "grad_norm": 14.34862232208252, "learning_rate": 2.8000354511738748e-05, "loss": 2.5284, "step": 2107500 }, { "epoch": 4.41270053923703, "grad_norm": 13.914358139038086, "learning_rate": 2.7995110255248452e-05, "loss": 2.5417, "step": 2108000 }, { "epoch": 4.413747194962656, "grad_norm": 13.941160202026367, "learning_rate": 2.7989865998758163e-05, "loss": 2.5313, "step": 2108500 }, { "epoch": 4.414793850688281, "grad_norm": 14.990921974182129, "learning_rate": 2.798462174226787e-05, "loss": 2.5294, "step": 2109000 }, { "epoch": 4.415840506413907, "grad_norm": 14.444406509399414, "learning_rate": 2.7979377485777582e-05, "loss": 2.536, "step": 2109500 }, { "epoch": 4.416887162139532, "grad_norm": 16.198923110961914, "learning_rate": 2.7974133229287286e-05, "loss": 2.5367, "step": 2110000 }, { "epoch": 4.4179338178651575, "grad_norm": 14.58454418182373, "learning_rate": 2.796888897279699e-05, "loss": 2.5225, "step": 2110500 }, { "epoch": 4.418980473590783, "grad_norm": 14.761541366577148, "learning_rate": 2.79636447163067e-05, "loss": 2.5534, "step": 2111000 }, { "epoch": 4.420027129316408, "grad_norm": 14.654532432556152, "learning_rate": 2.795840045981641e-05, "loss": 2.518, "step": 2111500 }, { "epoch": 4.421073785042034, "grad_norm": 15.89733600616455, "learning_rate": 2.795315620332612e-05, "loss": 2.5342, "step": 2112000 }, { "epoch": 4.422120440767659, "grad_norm": 15.742851257324219, "learning_rate": 2.7947911946835825e-05, "loss": 2.5414, "step": 2112500 }, { "epoch": 4.423167096493285, "grad_norm": 14.256978988647461, "learning_rate": 2.7942667690345536e-05, "loss": 2.541, "step": 2113000 }, { "epoch": 4.42421375221891, "grad_norm": 19.262983322143555, "learning_rate": 2.7937423433855243e-05, "loss": 2.5287, "step": 2113500 }, { "epoch": 4.425260407944536, "grad_norm": 13.63176155090332, "learning_rate": 2.7932179177364954e-05, "loss": 2.5407, "step": 2114000 }, { "epoch": 4.426307063670161, "grad_norm": 14.63033390045166, "learning_rate": 2.792693492087466e-05, "loss": 2.5006, "step": 2114500 }, { "epoch": 4.427353719395787, "grad_norm": 32.19702911376953, "learning_rate": 2.792169066438437e-05, "loss": 2.5347, "step": 2115000 }, { "epoch": 4.428400375121412, "grad_norm": 13.030511856079102, "learning_rate": 2.7916446407894074e-05, "loss": 2.5263, "step": 2115500 }, { "epoch": 4.4294470308470375, "grad_norm": 13.769142150878906, "learning_rate": 2.7911202151403782e-05, "loss": 2.5255, "step": 2116000 }, { "epoch": 4.430493686572663, "grad_norm": 16.387514114379883, "learning_rate": 2.7905957894913493e-05, "loss": 2.5258, "step": 2116500 }, { "epoch": 4.431540342298288, "grad_norm": 14.877472877502441, "learning_rate": 2.7900713638423197e-05, "loss": 2.5298, "step": 2117000 }, { "epoch": 4.432586998023914, "grad_norm": 15.22683334350586, "learning_rate": 2.789546938193291e-05, "loss": 2.5426, "step": 2117500 }, { "epoch": 4.433633653749539, "grad_norm": 14.50991439819336, "learning_rate": 2.7890225125442616e-05, "loss": 2.5203, "step": 2118000 }, { "epoch": 4.434680309475165, "grad_norm": 15.581316947937012, "learning_rate": 2.7884980868952327e-05, "loss": 2.5401, "step": 2118500 }, { "epoch": 4.43572696520079, "grad_norm": 13.102250099182129, "learning_rate": 2.787973661246203e-05, "loss": 2.5419, "step": 2119000 }, { "epoch": 4.436773620926416, "grad_norm": 14.840591430664062, "learning_rate": 2.7874492355971742e-05, "loss": 2.531, "step": 2119500 }, { "epoch": 4.437820276652041, "grad_norm": 16.394123077392578, "learning_rate": 2.786924809948145e-05, "loss": 2.5387, "step": 2120000 }, { "epoch": 4.438866932377667, "grad_norm": 14.909346580505371, "learning_rate": 2.786400384299116e-05, "loss": 2.5418, "step": 2120500 }, { "epoch": 4.439913588103292, "grad_norm": 15.81054401397705, "learning_rate": 2.7858759586500866e-05, "loss": 2.5282, "step": 2121000 }, { "epoch": 4.4409602438289175, "grad_norm": 13.395647048950195, "learning_rate": 2.785351533001057e-05, "loss": 2.533, "step": 2121500 }, { "epoch": 4.442006899554543, "grad_norm": 12.064144134521484, "learning_rate": 2.784827107352028e-05, "loss": 2.5225, "step": 2122000 }, { "epoch": 4.443053555280168, "grad_norm": 18.391735076904297, "learning_rate": 2.784302681702999e-05, "loss": 2.5282, "step": 2122500 }, { "epoch": 4.444100211005794, "grad_norm": 14.464149475097656, "learning_rate": 2.78377825605397e-05, "loss": 2.5387, "step": 2123000 }, { "epoch": 4.44514686673142, "grad_norm": 14.991941452026367, "learning_rate": 2.7832538304049404e-05, "loss": 2.5252, "step": 2123500 }, { "epoch": 4.446193522457046, "grad_norm": 17.004547119140625, "learning_rate": 2.7827294047559115e-05, "loss": 2.54, "step": 2124000 }, { "epoch": 4.447240178182671, "grad_norm": 16.032732009887695, "learning_rate": 2.7822049791068823e-05, "loss": 2.525, "step": 2124500 }, { "epoch": 4.448286833908297, "grad_norm": 13.215410232543945, "learning_rate": 2.7816805534578534e-05, "loss": 2.5244, "step": 2125000 }, { "epoch": 4.449333489633922, "grad_norm": 15.856378555297852, "learning_rate": 2.7811561278088238e-05, "loss": 2.522, "step": 2125500 }, { "epoch": 4.4503801453595475, "grad_norm": 15.912967681884766, "learning_rate": 2.780631702159795e-05, "loss": 2.5307, "step": 2126000 }, { "epoch": 4.451426801085173, "grad_norm": 14.018474578857422, "learning_rate": 2.7801072765107657e-05, "loss": 2.5348, "step": 2126500 }, { "epoch": 4.452473456810798, "grad_norm": 13.046879768371582, "learning_rate": 2.779582850861736e-05, "loss": 2.5567, "step": 2127000 }, { "epoch": 4.453520112536424, "grad_norm": 16.102209091186523, "learning_rate": 2.7790584252127072e-05, "loss": 2.5346, "step": 2127500 }, { "epoch": 4.454566768262049, "grad_norm": 14.45245361328125, "learning_rate": 2.7785339995636777e-05, "loss": 2.5461, "step": 2128000 }, { "epoch": 4.455613423987675, "grad_norm": 14.310155868530273, "learning_rate": 2.7780095739146488e-05, "loss": 2.5395, "step": 2128500 }, { "epoch": 4.4566600797133, "grad_norm": 13.809419631958008, "learning_rate": 2.7774851482656195e-05, "loss": 2.5195, "step": 2129000 }, { "epoch": 4.457706735438926, "grad_norm": 15.36168098449707, "learning_rate": 2.7769607226165906e-05, "loss": 2.5215, "step": 2129500 }, { "epoch": 4.458753391164551, "grad_norm": 14.467397689819336, "learning_rate": 2.776436296967561e-05, "loss": 2.5256, "step": 2130000 }, { "epoch": 4.459800046890177, "grad_norm": 13.447651863098145, "learning_rate": 2.7759118713185322e-05, "loss": 2.5194, "step": 2130500 }, { "epoch": 4.460846702615802, "grad_norm": 13.721724510192871, "learning_rate": 2.775387445669503e-05, "loss": 2.543, "step": 2131000 }, { "epoch": 4.4618933583414275, "grad_norm": 14.774055480957031, "learning_rate": 2.774863020020474e-05, "loss": 2.5326, "step": 2131500 }, { "epoch": 4.462940014067053, "grad_norm": 16.121980667114258, "learning_rate": 2.7743385943714445e-05, "loss": 2.542, "step": 2132000 }, { "epoch": 4.463986669792678, "grad_norm": 19.234771728515625, "learning_rate": 2.7738141687224156e-05, "loss": 2.5391, "step": 2132500 }, { "epoch": 4.465033325518304, "grad_norm": 14.339823722839355, "learning_rate": 2.773289743073386e-05, "loss": 2.548, "step": 2133000 }, { "epoch": 4.466079981243929, "grad_norm": 15.786249160766602, "learning_rate": 2.7727653174243568e-05, "loss": 2.5199, "step": 2133500 }, { "epoch": 4.467126636969555, "grad_norm": 15.188830375671387, "learning_rate": 2.772240891775328e-05, "loss": 2.5489, "step": 2134000 }, { "epoch": 4.46817329269518, "grad_norm": 14.983784675598145, "learning_rate": 2.7717164661262983e-05, "loss": 2.5277, "step": 2134500 }, { "epoch": 4.469219948420806, "grad_norm": 14.357185363769531, "learning_rate": 2.7711920404772694e-05, "loss": 2.5321, "step": 2135000 }, { "epoch": 4.470266604146431, "grad_norm": 13.548791885375977, "learning_rate": 2.7706676148282402e-05, "loss": 2.5355, "step": 2135500 }, { "epoch": 4.471313259872057, "grad_norm": 15.317741394042969, "learning_rate": 2.7701431891792113e-05, "loss": 2.5175, "step": 2136000 }, { "epoch": 4.472359915597682, "grad_norm": 15.372045516967773, "learning_rate": 2.7696187635301818e-05, "loss": 2.5294, "step": 2136500 }, { "epoch": 4.4734065713233075, "grad_norm": 13.996832847595215, "learning_rate": 2.769094337881153e-05, "loss": 2.5503, "step": 2137000 }, { "epoch": 4.474453227048933, "grad_norm": 13.80891227722168, "learning_rate": 2.7685699122321236e-05, "loss": 2.5278, "step": 2137500 }, { "epoch": 4.4754998827745585, "grad_norm": 17.23287582397461, "learning_rate": 2.7680454865830947e-05, "loss": 2.5348, "step": 2138000 }, { "epoch": 4.476546538500184, "grad_norm": 15.167581558227539, "learning_rate": 2.767521060934065e-05, "loss": 2.5306, "step": 2138500 }, { "epoch": 4.477593194225809, "grad_norm": 14.137149810791016, "learning_rate": 2.7669966352850356e-05, "loss": 2.5413, "step": 2139000 }, { "epoch": 4.478639849951435, "grad_norm": 14.385046005249023, "learning_rate": 2.7664722096360067e-05, "loss": 2.5432, "step": 2139500 }, { "epoch": 4.47968650567706, "grad_norm": 16.88346290588379, "learning_rate": 2.7659477839869775e-05, "loss": 2.5283, "step": 2140000 }, { "epoch": 4.480733161402686, "grad_norm": 14.572808265686035, "learning_rate": 2.7654233583379486e-05, "loss": 2.5553, "step": 2140500 }, { "epoch": 4.481779817128311, "grad_norm": 15.316609382629395, "learning_rate": 2.764898932688919e-05, "loss": 2.5354, "step": 2141000 }, { "epoch": 4.482826472853937, "grad_norm": 14.698931694030762, "learning_rate": 2.76437450703989e-05, "loss": 2.5166, "step": 2141500 }, { "epoch": 4.483873128579563, "grad_norm": 13.86058235168457, "learning_rate": 2.763850081390861e-05, "loss": 2.5273, "step": 2142000 }, { "epoch": 4.4849197843051885, "grad_norm": 18.369661331176758, "learning_rate": 2.763325655741832e-05, "loss": 2.5338, "step": 2142500 }, { "epoch": 4.485966440030814, "grad_norm": 15.728021621704102, "learning_rate": 2.7628012300928024e-05, "loss": 2.5332, "step": 2143000 }, { "epoch": 4.487013095756439, "grad_norm": 16.089733123779297, "learning_rate": 2.7622768044437735e-05, "loss": 2.5377, "step": 2143500 }, { "epoch": 4.488059751482065, "grad_norm": 16.424516677856445, "learning_rate": 2.761752378794744e-05, "loss": 2.5508, "step": 2144000 }, { "epoch": 4.48910640720769, "grad_norm": 16.139665603637695, "learning_rate": 2.7612279531457147e-05, "loss": 2.5309, "step": 2144500 }, { "epoch": 4.490153062933316, "grad_norm": 12.817242622375488, "learning_rate": 2.760703527496686e-05, "loss": 2.531, "step": 2145000 }, { "epoch": 4.491199718658941, "grad_norm": 15.620709419250488, "learning_rate": 2.7601791018476563e-05, "loss": 2.5271, "step": 2145500 }, { "epoch": 4.492246374384567, "grad_norm": 14.106118202209473, "learning_rate": 2.7596546761986274e-05, "loss": 2.54, "step": 2146000 }, { "epoch": 4.493293030110192, "grad_norm": 14.775148391723633, "learning_rate": 2.759130250549598e-05, "loss": 2.5337, "step": 2146500 }, { "epoch": 4.4943396858358176, "grad_norm": 14.047466278076172, "learning_rate": 2.7586058249005693e-05, "loss": 2.5303, "step": 2147000 }, { "epoch": 4.495386341561443, "grad_norm": 14.978745460510254, "learning_rate": 2.7580813992515397e-05, "loss": 2.5231, "step": 2147500 }, { "epoch": 4.4964329972870685, "grad_norm": 20.58027458190918, "learning_rate": 2.7575569736025108e-05, "loss": 2.5336, "step": 2148000 }, { "epoch": 4.497479653012694, "grad_norm": 13.880894660949707, "learning_rate": 2.7570325479534816e-05, "loss": 2.545, "step": 2148500 }, { "epoch": 4.498526308738319, "grad_norm": 14.732693672180176, "learning_rate": 2.7565081223044527e-05, "loss": 2.5224, "step": 2149000 }, { "epoch": 4.499572964463945, "grad_norm": 16.037246704101562, "learning_rate": 2.755983696655423e-05, "loss": 2.5172, "step": 2149500 }, { "epoch": 4.50061962018957, "grad_norm": 13.899301528930664, "learning_rate": 2.7554592710063935e-05, "loss": 2.5305, "step": 2150000 }, { "epoch": 4.501666275915196, "grad_norm": 12.67846393585205, "learning_rate": 2.7549348453573646e-05, "loss": 2.5223, "step": 2150500 }, { "epoch": 4.502712931640821, "grad_norm": 17.72030258178711, "learning_rate": 2.7544104197083354e-05, "loss": 2.5187, "step": 2151000 }, { "epoch": 4.503759587366447, "grad_norm": 13.654911994934082, "learning_rate": 2.7538859940593065e-05, "loss": 2.5298, "step": 2151500 }, { "epoch": 4.504806243092072, "grad_norm": 15.979862213134766, "learning_rate": 2.753361568410277e-05, "loss": 2.5352, "step": 2152000 }, { "epoch": 4.505852898817698, "grad_norm": 18.977602005004883, "learning_rate": 2.752837142761248e-05, "loss": 2.5332, "step": 2152500 }, { "epoch": 4.506899554543323, "grad_norm": 15.818939208984375, "learning_rate": 2.7523127171122188e-05, "loss": 2.5446, "step": 2153000 }, { "epoch": 4.5079462102689485, "grad_norm": 15.435956001281738, "learning_rate": 2.75178829146319e-05, "loss": 2.5371, "step": 2153500 }, { "epoch": 4.508992865994574, "grad_norm": 14.850310325622559, "learning_rate": 2.7512638658141604e-05, "loss": 2.5211, "step": 2154000 }, { "epoch": 4.510039521720199, "grad_norm": 17.94774055480957, "learning_rate": 2.7507394401651315e-05, "loss": 2.5068, "step": 2154500 }, { "epoch": 4.511086177445825, "grad_norm": 13.696747779846191, "learning_rate": 2.7502150145161022e-05, "loss": 2.5109, "step": 2155000 }, { "epoch": 4.51213283317145, "grad_norm": 14.531420707702637, "learning_rate": 2.7496905888670727e-05, "loss": 2.5418, "step": 2155500 }, { "epoch": 4.513179488897076, "grad_norm": 15.111370086669922, "learning_rate": 2.7491661632180438e-05, "loss": 2.5323, "step": 2156000 }, { "epoch": 4.514226144622701, "grad_norm": 13.961689949035645, "learning_rate": 2.7486417375690142e-05, "loss": 2.5212, "step": 2156500 }, { "epoch": 4.515272800348327, "grad_norm": 16.526735305786133, "learning_rate": 2.7481173119199853e-05, "loss": 2.5446, "step": 2157000 }, { "epoch": 4.516319456073952, "grad_norm": 13.72460651397705, "learning_rate": 2.747592886270956e-05, "loss": 2.5398, "step": 2157500 }, { "epoch": 4.5173661117995785, "grad_norm": 15.70749568939209, "learning_rate": 2.7470684606219272e-05, "loss": 2.5395, "step": 2158000 }, { "epoch": 4.518412767525204, "grad_norm": 13.180347442626953, "learning_rate": 2.7465440349728976e-05, "loss": 2.5393, "step": 2158500 }, { "epoch": 4.519459423250829, "grad_norm": 15.201114654541016, "learning_rate": 2.7460196093238687e-05, "loss": 2.5091, "step": 2159000 }, { "epoch": 4.520506078976455, "grad_norm": 14.978913307189941, "learning_rate": 2.7454951836748395e-05, "loss": 2.513, "step": 2159500 }, { "epoch": 4.52155273470208, "grad_norm": 14.70405387878418, "learning_rate": 2.7449707580258106e-05, "loss": 2.5499, "step": 2160000 }, { "epoch": 4.522599390427706, "grad_norm": 18.385665893554688, "learning_rate": 2.744446332376781e-05, "loss": 2.5258, "step": 2160500 }, { "epoch": 4.523646046153331, "grad_norm": 15.754669189453125, "learning_rate": 2.7439219067277515e-05, "loss": 2.5013, "step": 2161000 }, { "epoch": 4.524692701878957, "grad_norm": 14.65768814086914, "learning_rate": 2.7433974810787226e-05, "loss": 2.5223, "step": 2161500 }, { "epoch": 4.525739357604582, "grad_norm": 16.26030921936035, "learning_rate": 2.7428730554296933e-05, "loss": 2.5347, "step": 2162000 }, { "epoch": 4.526786013330208, "grad_norm": 13.603789329528809, "learning_rate": 2.7423486297806645e-05, "loss": 2.5152, "step": 2162500 }, { "epoch": 4.527832669055833, "grad_norm": 14.480066299438477, "learning_rate": 2.741824204131635e-05, "loss": 2.5425, "step": 2163000 }, { "epoch": 4.5288793247814585, "grad_norm": 15.435148239135742, "learning_rate": 2.741299778482606e-05, "loss": 2.5271, "step": 2163500 }, { "epoch": 4.529925980507084, "grad_norm": 15.32526683807373, "learning_rate": 2.7407753528335768e-05, "loss": 2.5224, "step": 2164000 }, { "epoch": 4.530972636232709, "grad_norm": 14.497462272644043, "learning_rate": 2.740250927184548e-05, "loss": 2.525, "step": 2164500 }, { "epoch": 4.532019291958335, "grad_norm": 12.68690299987793, "learning_rate": 2.7397265015355183e-05, "loss": 2.5203, "step": 2165000 }, { "epoch": 4.53306594768396, "grad_norm": 14.292277336120605, "learning_rate": 2.7392020758864894e-05, "loss": 2.5399, "step": 2165500 }, { "epoch": 4.534112603409586, "grad_norm": 14.794736862182617, "learning_rate": 2.7386776502374602e-05, "loss": 2.5372, "step": 2166000 }, { "epoch": 4.535159259135211, "grad_norm": 15.395577430725098, "learning_rate": 2.7381532245884313e-05, "loss": 2.5418, "step": 2166500 }, { "epoch": 4.536205914860837, "grad_norm": 13.546442985534668, "learning_rate": 2.7376287989394017e-05, "loss": 2.5198, "step": 2167000 }, { "epoch": 4.537252570586462, "grad_norm": 14.171347618103027, "learning_rate": 2.737104373290372e-05, "loss": 2.5271, "step": 2167500 }, { "epoch": 4.538299226312088, "grad_norm": 14.56408405303955, "learning_rate": 2.7365799476413433e-05, "loss": 2.5268, "step": 2168000 }, { "epoch": 4.539345882037713, "grad_norm": 13.105595588684082, "learning_rate": 2.736055521992314e-05, "loss": 2.526, "step": 2168500 }, { "epoch": 4.5403925377633385, "grad_norm": 15.383212089538574, "learning_rate": 2.735531096343285e-05, "loss": 2.513, "step": 2169000 }, { "epoch": 4.541439193488964, "grad_norm": 15.220344543457031, "learning_rate": 2.7350066706942556e-05, "loss": 2.5264, "step": 2169500 }, { "epoch": 4.542485849214589, "grad_norm": 14.078306198120117, "learning_rate": 2.7344822450452267e-05, "loss": 2.5296, "step": 2170000 }, { "epoch": 4.543532504940215, "grad_norm": 17.141765594482422, "learning_rate": 2.7339578193961974e-05, "loss": 2.5396, "step": 2170500 }, { "epoch": 4.54457916066584, "grad_norm": 16.08547019958496, "learning_rate": 2.7334333937471685e-05, "loss": 2.5308, "step": 2171000 }, { "epoch": 4.545625816391466, "grad_norm": 16.7176570892334, "learning_rate": 2.732908968098139e-05, "loss": 2.5435, "step": 2171500 }, { "epoch": 4.546672472117091, "grad_norm": 13.797218322753906, "learning_rate": 2.73238454244911e-05, "loss": 2.5165, "step": 2172000 }, { "epoch": 4.547719127842717, "grad_norm": 19.481035232543945, "learning_rate": 2.731860116800081e-05, "loss": 2.5098, "step": 2172500 }, { "epoch": 4.548765783568342, "grad_norm": 15.744244575500488, "learning_rate": 2.7313356911510513e-05, "loss": 2.53, "step": 2173000 }, { "epoch": 4.549812439293968, "grad_norm": 14.822980880737305, "learning_rate": 2.7308112655020224e-05, "loss": 2.5213, "step": 2173500 }, { "epoch": 4.550859095019593, "grad_norm": 15.120344161987305, "learning_rate": 2.7302868398529928e-05, "loss": 2.546, "step": 2174000 }, { "epoch": 4.5519057507452185, "grad_norm": 13.170762062072754, "learning_rate": 2.729762414203964e-05, "loss": 2.5274, "step": 2174500 }, { "epoch": 4.552952406470844, "grad_norm": 16.358198165893555, "learning_rate": 2.7292379885549347e-05, "loss": 2.5238, "step": 2175000 }, { "epoch": 4.553999062196469, "grad_norm": 16.266834259033203, "learning_rate": 2.7287135629059058e-05, "loss": 2.5018, "step": 2175500 }, { "epoch": 4.555045717922095, "grad_norm": 13.097407341003418, "learning_rate": 2.7281891372568762e-05, "loss": 2.5192, "step": 2176000 }, { "epoch": 4.55609237364772, "grad_norm": 15.676533699035645, "learning_rate": 2.7276647116078473e-05, "loss": 2.5208, "step": 2176500 }, { "epoch": 4.557139029373346, "grad_norm": 13.577620506286621, "learning_rate": 2.727140285958818e-05, "loss": 2.5174, "step": 2177000 }, { "epoch": 4.558185685098972, "grad_norm": 15.389678001403809, "learning_rate": 2.7266158603097892e-05, "loss": 2.5234, "step": 2177500 }, { "epoch": 4.559232340824598, "grad_norm": 17.634702682495117, "learning_rate": 2.7260914346607597e-05, "loss": 2.5431, "step": 2178000 }, { "epoch": 4.560278996550223, "grad_norm": 15.71954345703125, "learning_rate": 2.72556700901173e-05, "loss": 2.5245, "step": 2178500 }, { "epoch": 4.5613256522758485, "grad_norm": 19.7834529876709, "learning_rate": 2.7250425833627012e-05, "loss": 2.5449, "step": 2179000 }, { "epoch": 4.562372308001474, "grad_norm": 15.190118789672852, "learning_rate": 2.724518157713672e-05, "loss": 2.5245, "step": 2179500 }, { "epoch": 4.563418963727099, "grad_norm": 14.090044021606445, "learning_rate": 2.723993732064643e-05, "loss": 2.5264, "step": 2180000 }, { "epoch": 4.564465619452725, "grad_norm": 15.547324180603027, "learning_rate": 2.7234693064156135e-05, "loss": 2.5137, "step": 2180500 }, { "epoch": 4.56551227517835, "grad_norm": 16.725492477416992, "learning_rate": 2.7229448807665846e-05, "loss": 2.5273, "step": 2181000 }, { "epoch": 4.566558930903976, "grad_norm": 13.874363899230957, "learning_rate": 2.7224204551175554e-05, "loss": 2.5249, "step": 2181500 }, { "epoch": 4.567605586629601, "grad_norm": 16.44306182861328, "learning_rate": 2.7218960294685265e-05, "loss": 2.5383, "step": 2182000 }, { "epoch": 4.568652242355227, "grad_norm": 12.920527458190918, "learning_rate": 2.721371603819497e-05, "loss": 2.5277, "step": 2182500 }, { "epoch": 4.569698898080852, "grad_norm": 17.24798011779785, "learning_rate": 2.720847178170468e-05, "loss": 2.5276, "step": 2183000 }, { "epoch": 4.570745553806478, "grad_norm": 17.01425552368164, "learning_rate": 2.7203227525214388e-05, "loss": 2.5325, "step": 2183500 }, { "epoch": 4.571792209532103, "grad_norm": 14.764466285705566, "learning_rate": 2.7197983268724092e-05, "loss": 2.5443, "step": 2184000 }, { "epoch": 4.5728388652577285, "grad_norm": 19.1207275390625, "learning_rate": 2.7192739012233803e-05, "loss": 2.5457, "step": 2184500 }, { "epoch": 4.573885520983354, "grad_norm": 17.292442321777344, "learning_rate": 2.7187494755743508e-05, "loss": 2.5181, "step": 2185000 }, { "epoch": 4.5749321767089794, "grad_norm": 19.80965805053711, "learning_rate": 2.718225049925322e-05, "loss": 2.5111, "step": 2185500 }, { "epoch": 4.575978832434605, "grad_norm": 13.54409122467041, "learning_rate": 2.7177006242762926e-05, "loss": 2.5106, "step": 2186000 }, { "epoch": 4.57702548816023, "grad_norm": 15.590231895446777, "learning_rate": 2.7171761986272637e-05, "loss": 2.5498, "step": 2186500 }, { "epoch": 4.578072143885856, "grad_norm": 14.772274017333984, "learning_rate": 2.7166517729782342e-05, "loss": 2.5129, "step": 2187000 }, { "epoch": 4.579118799611481, "grad_norm": 16.485212326049805, "learning_rate": 2.7161273473292053e-05, "loss": 2.5212, "step": 2187500 }, { "epoch": 4.580165455337107, "grad_norm": 15.943374633789062, "learning_rate": 2.715602921680176e-05, "loss": 2.5234, "step": 2188000 }, { "epoch": 4.581212111062732, "grad_norm": 17.37286376953125, "learning_rate": 2.715078496031147e-05, "loss": 2.5172, "step": 2188500 }, { "epoch": 4.582258766788358, "grad_norm": 14.483336448669434, "learning_rate": 2.7145540703821176e-05, "loss": 2.5169, "step": 2189000 }, { "epoch": 4.583305422513983, "grad_norm": 14.962327003479004, "learning_rate": 2.714029644733088e-05, "loss": 2.5236, "step": 2189500 }, { "epoch": 4.5843520782396086, "grad_norm": 27.41204833984375, "learning_rate": 2.7135052190840595e-05, "loss": 2.5142, "step": 2190000 }, { "epoch": 4.585398733965234, "grad_norm": 18.20020294189453, "learning_rate": 2.71298079343503e-05, "loss": 2.5276, "step": 2190500 }, { "epoch": 4.5864453896908595, "grad_norm": 14.310914993286133, "learning_rate": 2.712456367786001e-05, "loss": 2.5299, "step": 2191000 }, { "epoch": 4.587492045416485, "grad_norm": 17.558935165405273, "learning_rate": 2.7119319421369714e-05, "loss": 2.5516, "step": 2191500 }, { "epoch": 4.58853870114211, "grad_norm": 17.884075164794922, "learning_rate": 2.7114075164879425e-05, "loss": 2.5388, "step": 2192000 }, { "epoch": 4.589585356867736, "grad_norm": 15.08813190460205, "learning_rate": 2.7108830908389133e-05, "loss": 2.5327, "step": 2192500 }, { "epoch": 4.590632012593362, "grad_norm": 16.52048110961914, "learning_rate": 2.7103586651898844e-05, "loss": 2.5222, "step": 2193000 }, { "epoch": 4.591678668318988, "grad_norm": 15.963212966918945, "learning_rate": 2.709834239540855e-05, "loss": 2.5066, "step": 2193500 }, { "epoch": 4.592725324044613, "grad_norm": 15.29112720489502, "learning_rate": 2.709309813891826e-05, "loss": 2.5185, "step": 2194000 }, { "epoch": 4.5937719797702385, "grad_norm": 15.88613510131836, "learning_rate": 2.7087853882427967e-05, "loss": 2.5184, "step": 2194500 }, { "epoch": 4.594818635495864, "grad_norm": 14.061882972717285, "learning_rate": 2.708260962593767e-05, "loss": 2.5114, "step": 2195000 }, { "epoch": 4.5958652912214895, "grad_norm": 12.778717041015625, "learning_rate": 2.7077365369447383e-05, "loss": 2.5215, "step": 2195500 }, { "epoch": 4.596911946947115, "grad_norm": 14.233996391296387, "learning_rate": 2.7072121112957087e-05, "loss": 2.524, "step": 2196000 }, { "epoch": 4.59795860267274, "grad_norm": 15.763998985290527, "learning_rate": 2.7066876856466798e-05, "loss": 2.5201, "step": 2196500 }, { "epoch": 4.599005258398366, "grad_norm": 13.61730670928955, "learning_rate": 2.7061632599976506e-05, "loss": 2.5312, "step": 2197000 }, { "epoch": 4.600051914123991, "grad_norm": 14.045695304870605, "learning_rate": 2.7056388343486217e-05, "loss": 2.514, "step": 2197500 }, { "epoch": 4.601098569849617, "grad_norm": 15.56725025177002, "learning_rate": 2.705114408699592e-05, "loss": 2.5489, "step": 2198000 }, { "epoch": 4.602145225575242, "grad_norm": 14.75834846496582, "learning_rate": 2.7045899830505632e-05, "loss": 2.5366, "step": 2198500 }, { "epoch": 4.603191881300868, "grad_norm": 14.215961456298828, "learning_rate": 2.704065557401534e-05, "loss": 2.5395, "step": 2199000 }, { "epoch": 4.604238537026493, "grad_norm": 15.833538055419922, "learning_rate": 2.703541131752505e-05, "loss": 2.5214, "step": 2199500 }, { "epoch": 4.605285192752119, "grad_norm": 14.137104988098145, "learning_rate": 2.7030167061034755e-05, "loss": 2.4959, "step": 2200000 }, { "epoch": 4.606331848477744, "grad_norm": 16.7593994140625, "learning_rate": 2.7024922804544466e-05, "loss": 2.529, "step": 2200500 }, { "epoch": 4.6073785042033695, "grad_norm": 16.14056968688965, "learning_rate": 2.7019678548054174e-05, "loss": 2.5362, "step": 2201000 }, { "epoch": 4.608425159928995, "grad_norm": 16.827465057373047, "learning_rate": 2.701443429156388e-05, "loss": 2.5291, "step": 2201500 }, { "epoch": 4.60947181565462, "grad_norm": 16.69978904724121, "learning_rate": 2.700919003507359e-05, "loss": 2.5148, "step": 2202000 }, { "epoch": 4.610518471380246, "grad_norm": 18.436511993408203, "learning_rate": 2.7003945778583294e-05, "loss": 2.5271, "step": 2202500 }, { "epoch": 4.611565127105871, "grad_norm": 15.272010803222656, "learning_rate": 2.6998701522093005e-05, "loss": 2.5307, "step": 2203000 }, { "epoch": 4.612611782831497, "grad_norm": 16.196189880371094, "learning_rate": 2.6993457265602713e-05, "loss": 2.5395, "step": 2203500 }, { "epoch": 4.613658438557122, "grad_norm": 16.350330352783203, "learning_rate": 2.6988213009112424e-05, "loss": 2.5174, "step": 2204000 }, { "epoch": 4.614705094282748, "grad_norm": 15.147662162780762, "learning_rate": 2.6982968752622128e-05, "loss": 2.5377, "step": 2204500 }, { "epoch": 4.615751750008373, "grad_norm": 14.635889053344727, "learning_rate": 2.697772449613184e-05, "loss": 2.5222, "step": 2205000 }, { "epoch": 4.616798405733999, "grad_norm": 16.96011734008789, "learning_rate": 2.6972480239641547e-05, "loss": 2.5288, "step": 2205500 }, { "epoch": 4.617845061459624, "grad_norm": 16.3867130279541, "learning_rate": 2.6967235983151258e-05, "loss": 2.5197, "step": 2206000 }, { "epoch": 4.6188917171852495, "grad_norm": 14.827167510986328, "learning_rate": 2.6961991726660962e-05, "loss": 2.5207, "step": 2206500 }, { "epoch": 4.619938372910875, "grad_norm": 15.675827026367188, "learning_rate": 2.6956747470170666e-05, "loss": 2.5318, "step": 2207000 }, { "epoch": 4.6209850286365, "grad_norm": 14.634086608886719, "learning_rate": 2.6951503213680377e-05, "loss": 2.5206, "step": 2207500 }, { "epoch": 4.622031684362126, "grad_norm": 14.158090591430664, "learning_rate": 2.6946258957190085e-05, "loss": 2.5201, "step": 2208000 }, { "epoch": 4.623078340087751, "grad_norm": 13.38867473602295, "learning_rate": 2.6941014700699796e-05, "loss": 2.5156, "step": 2208500 }, { "epoch": 4.624124995813377, "grad_norm": 13.063448905944824, "learning_rate": 2.69357704442095e-05, "loss": 2.5145, "step": 2209000 }, { "epoch": 4.625171651539002, "grad_norm": 18.26426124572754, "learning_rate": 2.693052618771921e-05, "loss": 2.5387, "step": 2209500 }, { "epoch": 4.626218307264628, "grad_norm": 14.186735153198242, "learning_rate": 2.692528193122892e-05, "loss": 2.5244, "step": 2210000 }, { "epoch": 4.627264962990253, "grad_norm": 21.488422393798828, "learning_rate": 2.692003767473863e-05, "loss": 2.5203, "step": 2210500 }, { "epoch": 4.628311618715879, "grad_norm": 16.32450294494629, "learning_rate": 2.6914793418248335e-05, "loss": 2.514, "step": 2211000 }, { "epoch": 4.629358274441504, "grad_norm": 15.603894233703613, "learning_rate": 2.6909549161758046e-05, "loss": 2.5103, "step": 2211500 }, { "epoch": 4.6304049301671295, "grad_norm": 18.03152847290039, "learning_rate": 2.6904304905267753e-05, "loss": 2.5203, "step": 2212000 }, { "epoch": 4.631451585892756, "grad_norm": 13.302985191345215, "learning_rate": 2.6899060648777458e-05, "loss": 2.5056, "step": 2212500 }, { "epoch": 4.632498241618381, "grad_norm": 15.167162895202637, "learning_rate": 2.689381639228717e-05, "loss": 2.5039, "step": 2213000 }, { "epoch": 4.633544897344007, "grad_norm": 14.610509872436523, "learning_rate": 2.6888572135796873e-05, "loss": 2.5207, "step": 2213500 }, { "epoch": 4.634591553069632, "grad_norm": 14.827693939208984, "learning_rate": 2.6883327879306584e-05, "loss": 2.5127, "step": 2214000 }, { "epoch": 4.635638208795258, "grad_norm": 14.34554386138916, "learning_rate": 2.6878083622816292e-05, "loss": 2.5292, "step": 2214500 }, { "epoch": 4.636684864520883, "grad_norm": 15.862000465393066, "learning_rate": 2.6872839366326003e-05, "loss": 2.5012, "step": 2215000 }, { "epoch": 4.637731520246509, "grad_norm": 13.496382713317871, "learning_rate": 2.6867595109835707e-05, "loss": 2.5271, "step": 2215500 }, { "epoch": 4.638778175972134, "grad_norm": 13.760891914367676, "learning_rate": 2.686235085334542e-05, "loss": 2.5096, "step": 2216000 }, { "epoch": 4.6398248316977595, "grad_norm": 15.687926292419434, "learning_rate": 2.6857106596855126e-05, "loss": 2.5262, "step": 2216500 }, { "epoch": 4.640871487423385, "grad_norm": 14.136921882629395, "learning_rate": 2.6851862340364837e-05, "loss": 2.5115, "step": 2217000 }, { "epoch": 4.64191814314901, "grad_norm": 15.213423728942871, "learning_rate": 2.684661808387454e-05, "loss": 2.5203, "step": 2217500 }, { "epoch": 4.642964798874636, "grad_norm": 13.630932807922363, "learning_rate": 2.684137382738425e-05, "loss": 2.5261, "step": 2218000 }, { "epoch": 4.644011454600261, "grad_norm": 13.6942777633667, "learning_rate": 2.683612957089396e-05, "loss": 2.5259, "step": 2218500 }, { "epoch": 4.645058110325887, "grad_norm": 16.7540340423584, "learning_rate": 2.6830885314403664e-05, "loss": 2.5191, "step": 2219000 }, { "epoch": 4.646104766051512, "grad_norm": 15.266504287719727, "learning_rate": 2.6825641057913376e-05, "loss": 2.499, "step": 2219500 }, { "epoch": 4.647151421777138, "grad_norm": 15.854622840881348, "learning_rate": 2.682039680142308e-05, "loss": 2.5184, "step": 2220000 }, { "epoch": 4.648198077502763, "grad_norm": 14.862586975097656, "learning_rate": 2.681515254493279e-05, "loss": 2.52, "step": 2220500 }, { "epoch": 4.649244733228389, "grad_norm": 15.919602394104004, "learning_rate": 2.68099082884425e-05, "loss": 2.5241, "step": 2221000 }, { "epoch": 4.650291388954014, "grad_norm": 19.28249740600586, "learning_rate": 2.680466403195221e-05, "loss": 2.5364, "step": 2221500 }, { "epoch": 4.6513380446796395, "grad_norm": 16.294504165649414, "learning_rate": 2.6799419775461914e-05, "loss": 2.496, "step": 2222000 }, { "epoch": 4.652384700405265, "grad_norm": 15.064724922180176, "learning_rate": 2.6794175518971625e-05, "loss": 2.5203, "step": 2222500 }, { "epoch": 4.65343135613089, "grad_norm": 16.59487533569336, "learning_rate": 2.6788931262481333e-05, "loss": 2.5121, "step": 2223000 }, { "epoch": 4.654478011856516, "grad_norm": 15.825385093688965, "learning_rate": 2.6783687005991037e-05, "loss": 2.5179, "step": 2223500 }, { "epoch": 4.655524667582141, "grad_norm": 14.985664367675781, "learning_rate": 2.6778442749500748e-05, "loss": 2.5273, "step": 2224000 }, { "epoch": 4.656571323307767, "grad_norm": 12.794907569885254, "learning_rate": 2.6773198493010452e-05, "loss": 2.5098, "step": 2224500 }, { "epoch": 4.657617979033392, "grad_norm": 14.099138259887695, "learning_rate": 2.6767954236520164e-05, "loss": 2.5164, "step": 2225000 }, { "epoch": 4.658664634759018, "grad_norm": 13.973749160766602, "learning_rate": 2.676270998002987e-05, "loss": 2.5269, "step": 2225500 }, { "epoch": 4.659711290484643, "grad_norm": 15.007107734680176, "learning_rate": 2.6757465723539582e-05, "loss": 2.5248, "step": 2226000 }, { "epoch": 4.660757946210269, "grad_norm": 15.31049633026123, "learning_rate": 2.6752221467049287e-05, "loss": 2.514, "step": 2226500 }, { "epoch": 4.661804601935894, "grad_norm": 17.634315490722656, "learning_rate": 2.6746977210558998e-05, "loss": 2.5458, "step": 2227000 }, { "epoch": 4.6628512576615195, "grad_norm": 14.49720573425293, "learning_rate": 2.6741732954068705e-05, "loss": 2.5251, "step": 2227500 }, { "epoch": 4.663897913387146, "grad_norm": 17.52928924560547, "learning_rate": 2.6736488697578416e-05, "loss": 2.5143, "step": 2228000 }, { "epoch": 4.664944569112771, "grad_norm": 15.277463912963867, "learning_rate": 2.673124444108812e-05, "loss": 2.5021, "step": 2228500 }, { "epoch": 4.665991224838397, "grad_norm": 17.03453254699707, "learning_rate": 2.6726000184597832e-05, "loss": 2.5219, "step": 2229000 }, { "epoch": 4.667037880564022, "grad_norm": 14.516613006591797, "learning_rate": 2.672075592810754e-05, "loss": 2.5372, "step": 2229500 }, { "epoch": 4.668084536289648, "grad_norm": 15.375031471252441, "learning_rate": 2.6715511671617244e-05, "loss": 2.5162, "step": 2230000 }, { "epoch": 4.669131192015273, "grad_norm": 15.297708511352539, "learning_rate": 2.6710267415126955e-05, "loss": 2.5327, "step": 2230500 }, { "epoch": 4.670177847740899, "grad_norm": 12.628450393676758, "learning_rate": 2.670502315863666e-05, "loss": 2.5225, "step": 2231000 }, { "epoch": 4.671224503466524, "grad_norm": 20.174312591552734, "learning_rate": 2.669977890214637e-05, "loss": 2.5181, "step": 2231500 }, { "epoch": 4.6722711591921495, "grad_norm": 14.217257499694824, "learning_rate": 2.6694534645656078e-05, "loss": 2.5044, "step": 2232000 }, { "epoch": 4.673317814917775, "grad_norm": 15.782289505004883, "learning_rate": 2.668929038916579e-05, "loss": 2.508, "step": 2232500 }, { "epoch": 4.6743644706434, "grad_norm": 16.459388732910156, "learning_rate": 2.6684046132675493e-05, "loss": 2.5288, "step": 2233000 }, { "epoch": 4.675411126369026, "grad_norm": 14.081459999084473, "learning_rate": 2.6678801876185204e-05, "loss": 2.5165, "step": 2233500 }, { "epoch": 4.676457782094651, "grad_norm": 15.236241340637207, "learning_rate": 2.6673557619694912e-05, "loss": 2.5095, "step": 2234000 }, { "epoch": 4.677504437820277, "grad_norm": 14.494884490966797, "learning_rate": 2.6668313363204623e-05, "loss": 2.5435, "step": 2234500 }, { "epoch": 4.678551093545902, "grad_norm": 16.771793365478516, "learning_rate": 2.6663069106714328e-05, "loss": 2.5241, "step": 2235000 }, { "epoch": 4.679597749271528, "grad_norm": 14.866591453552246, "learning_rate": 2.6657824850224032e-05, "loss": 2.5176, "step": 2235500 }, { "epoch": 4.680644404997153, "grad_norm": 16.44784164428711, "learning_rate": 2.6652580593733746e-05, "loss": 2.5148, "step": 2236000 }, { "epoch": 4.681691060722779, "grad_norm": 13.53327751159668, "learning_rate": 2.664733633724345e-05, "loss": 2.5148, "step": 2236500 }, { "epoch": 4.682737716448404, "grad_norm": 16.1954402923584, "learning_rate": 2.6642092080753162e-05, "loss": 2.5053, "step": 2237000 }, { "epoch": 4.6837843721740295, "grad_norm": 13.352803230285645, "learning_rate": 2.6636847824262866e-05, "loss": 2.5091, "step": 2237500 }, { "epoch": 4.684831027899655, "grad_norm": 16.634944915771484, "learning_rate": 2.6631603567772577e-05, "loss": 2.5087, "step": 2238000 }, { "epoch": 4.6858776836252805, "grad_norm": 17.400306701660156, "learning_rate": 2.6626359311282285e-05, "loss": 2.5208, "step": 2238500 }, { "epoch": 4.686924339350906, "grad_norm": 15.047417640686035, "learning_rate": 2.6621115054791996e-05, "loss": 2.5215, "step": 2239000 }, { "epoch": 4.687970995076531, "grad_norm": 14.2282075881958, "learning_rate": 2.66158707983017e-05, "loss": 2.5152, "step": 2239500 }, { "epoch": 4.689017650802157, "grad_norm": 14.272035598754883, "learning_rate": 2.661062654181141e-05, "loss": 2.5186, "step": 2240000 }, { "epoch": 4.690064306527782, "grad_norm": 14.704182624816895, "learning_rate": 2.660538228532112e-05, "loss": 2.5241, "step": 2240500 }, { "epoch": 4.691110962253408, "grad_norm": 16.051740646362305, "learning_rate": 2.6600138028830823e-05, "loss": 2.5297, "step": 2241000 }, { "epoch": 4.692157617979033, "grad_norm": 18.75470542907715, "learning_rate": 2.6594893772340534e-05, "loss": 2.5311, "step": 2241500 }, { "epoch": 4.693204273704659, "grad_norm": 15.210488319396973, "learning_rate": 2.658964951585024e-05, "loss": 2.5204, "step": 2242000 }, { "epoch": 4.694250929430284, "grad_norm": 15.541960716247559, "learning_rate": 2.658440525935995e-05, "loss": 2.5166, "step": 2242500 }, { "epoch": 4.69529758515591, "grad_norm": 14.780083656311035, "learning_rate": 2.6579161002869657e-05, "loss": 2.5369, "step": 2243000 }, { "epoch": 4.696344240881535, "grad_norm": 16.894445419311523, "learning_rate": 2.657391674637937e-05, "loss": 2.5189, "step": 2243500 }, { "epoch": 4.6973908966071605, "grad_norm": 14.715312004089355, "learning_rate": 2.6568672489889073e-05, "loss": 2.5206, "step": 2244000 }, { "epoch": 4.698437552332786, "grad_norm": 15.220630645751953, "learning_rate": 2.6563428233398784e-05, "loss": 2.5353, "step": 2244500 }, { "epoch": 4.699484208058411, "grad_norm": 14.312482833862305, "learning_rate": 2.655818397690849e-05, "loss": 2.517, "step": 2245000 }, { "epoch": 4.700530863784037, "grad_norm": 14.22808837890625, "learning_rate": 2.6552939720418203e-05, "loss": 2.5257, "step": 2245500 }, { "epoch": 4.701577519509662, "grad_norm": 16.87246322631836, "learning_rate": 2.6547695463927907e-05, "loss": 2.5294, "step": 2246000 }, { "epoch": 4.702624175235288, "grad_norm": 14.172266006469727, "learning_rate": 2.6542451207437615e-05, "loss": 2.5069, "step": 2246500 }, { "epoch": 4.703670830960913, "grad_norm": 14.076399803161621, "learning_rate": 2.6537206950947326e-05, "loss": 2.5185, "step": 2247000 }, { "epoch": 4.7047174866865396, "grad_norm": 14.949396133422852, "learning_rate": 2.653196269445703e-05, "loss": 2.5217, "step": 2247500 }, { "epoch": 4.705764142412165, "grad_norm": 17.528854370117188, "learning_rate": 2.652671843796674e-05, "loss": 2.5293, "step": 2248000 }, { "epoch": 4.7068107981377905, "grad_norm": 14.986572265625, "learning_rate": 2.6521474181476445e-05, "loss": 2.501, "step": 2248500 }, { "epoch": 4.707857453863416, "grad_norm": 17.193788528442383, "learning_rate": 2.6516229924986156e-05, "loss": 2.5054, "step": 2249000 }, { "epoch": 4.708904109589041, "grad_norm": 15.524441719055176, "learning_rate": 2.6510985668495864e-05, "loss": 2.5423, "step": 2249500 }, { "epoch": 4.709950765314667, "grad_norm": 16.05012321472168, "learning_rate": 2.6505741412005575e-05, "loss": 2.5255, "step": 2250000 }, { "epoch": 4.710997421040292, "grad_norm": 13.858975410461426, "learning_rate": 2.650049715551528e-05, "loss": 2.5134, "step": 2250500 }, { "epoch": 4.712044076765918, "grad_norm": 21.794328689575195, "learning_rate": 2.649525289902499e-05, "loss": 2.5033, "step": 2251000 }, { "epoch": 4.713090732491543, "grad_norm": 13.605498313903809, "learning_rate": 2.6490008642534698e-05, "loss": 2.5229, "step": 2251500 }, { "epoch": 4.714137388217169, "grad_norm": 18.887548446655273, "learning_rate": 2.6484764386044403e-05, "loss": 2.5436, "step": 2252000 }, { "epoch": 4.715184043942794, "grad_norm": 14.594831466674805, "learning_rate": 2.6479520129554114e-05, "loss": 2.5152, "step": 2252500 }, { "epoch": 4.71623069966842, "grad_norm": 13.535501480102539, "learning_rate": 2.6474275873063818e-05, "loss": 2.5188, "step": 2253000 }, { "epoch": 4.717277355394045, "grad_norm": 12.543516159057617, "learning_rate": 2.6469031616573532e-05, "loss": 2.5224, "step": 2253500 }, { "epoch": 4.7183240111196705, "grad_norm": 17.523714065551758, "learning_rate": 2.6463787360083237e-05, "loss": 2.5221, "step": 2254000 }, { "epoch": 4.719370666845296, "grad_norm": 14.83873176574707, "learning_rate": 2.6458543103592948e-05, "loss": 2.5225, "step": 2254500 }, { "epoch": 4.720417322570921, "grad_norm": 15.968240737915039, "learning_rate": 2.6453298847102652e-05, "loss": 2.5276, "step": 2255000 }, { "epoch": 4.721463978296547, "grad_norm": 15.360346794128418, "learning_rate": 2.6448054590612363e-05, "loss": 2.5246, "step": 2255500 }, { "epoch": 4.722510634022172, "grad_norm": 14.428658485412598, "learning_rate": 2.644281033412207e-05, "loss": 2.515, "step": 2256000 }, { "epoch": 4.723557289747798, "grad_norm": 12.303945541381836, "learning_rate": 2.6437566077631782e-05, "loss": 2.5219, "step": 2256500 }, { "epoch": 4.724603945473423, "grad_norm": 15.968294143676758, "learning_rate": 2.6432321821141486e-05, "loss": 2.5209, "step": 2257000 }, { "epoch": 4.725650601199049, "grad_norm": 17.2413272857666, "learning_rate": 2.6427077564651194e-05, "loss": 2.5234, "step": 2257500 }, { "epoch": 4.726697256924674, "grad_norm": 14.099103927612305, "learning_rate": 2.6421833308160905e-05, "loss": 2.517, "step": 2258000 }, { "epoch": 4.7277439126503, "grad_norm": 15.927361488342285, "learning_rate": 2.641658905167061e-05, "loss": 2.5131, "step": 2258500 }, { "epoch": 4.728790568375925, "grad_norm": 15.866120338439941, "learning_rate": 2.641134479518032e-05, "loss": 2.5113, "step": 2259000 }, { "epoch": 4.7298372241015505, "grad_norm": 16.16792106628418, "learning_rate": 2.6406100538690025e-05, "loss": 2.5134, "step": 2259500 }, { "epoch": 4.730883879827176, "grad_norm": 13.07205581665039, "learning_rate": 2.6400856282199736e-05, "loss": 2.5289, "step": 2260000 }, { "epoch": 4.731930535552801, "grad_norm": 14.162617683410645, "learning_rate": 2.6395612025709444e-05, "loss": 2.5243, "step": 2260500 }, { "epoch": 4.732977191278427, "grad_norm": 15.194743156433105, "learning_rate": 2.6390367769219155e-05, "loss": 2.5168, "step": 2261000 }, { "epoch": 4.734023847004052, "grad_norm": 14.590592384338379, "learning_rate": 2.638512351272886e-05, "loss": 2.5116, "step": 2261500 }, { "epoch": 4.735070502729678, "grad_norm": 14.16331672668457, "learning_rate": 2.637987925623857e-05, "loss": 2.506, "step": 2262000 }, { "epoch": 4.736117158455303, "grad_norm": 29.374650955200195, "learning_rate": 2.6374634999748278e-05, "loss": 2.5083, "step": 2262500 }, { "epoch": 4.73716381418093, "grad_norm": 14.420379638671875, "learning_rate": 2.636939074325799e-05, "loss": 2.5239, "step": 2263000 }, { "epoch": 4.738210469906555, "grad_norm": 15.857884407043457, "learning_rate": 2.6364146486767693e-05, "loss": 2.5141, "step": 2263500 }, { "epoch": 4.7392571256321805, "grad_norm": 14.474516868591309, "learning_rate": 2.63589022302774e-05, "loss": 2.5107, "step": 2264000 }, { "epoch": 4.740303781357806, "grad_norm": 13.727346420288086, "learning_rate": 2.6353657973787112e-05, "loss": 2.526, "step": 2264500 }, { "epoch": 4.741350437083431, "grad_norm": 16.181299209594727, "learning_rate": 2.6348413717296816e-05, "loss": 2.5341, "step": 2265000 }, { "epoch": 4.742397092809057, "grad_norm": 15.453680992126465, "learning_rate": 2.6343169460806527e-05, "loss": 2.5008, "step": 2265500 }, { "epoch": 4.743443748534682, "grad_norm": 15.084554672241211, "learning_rate": 2.633792520431623e-05, "loss": 2.5248, "step": 2266000 }, { "epoch": 4.744490404260308, "grad_norm": 15.673442840576172, "learning_rate": 2.6332680947825943e-05, "loss": 2.5163, "step": 2266500 }, { "epoch": 4.745537059985933, "grad_norm": 15.067231178283691, "learning_rate": 2.632743669133565e-05, "loss": 2.5359, "step": 2267000 }, { "epoch": 4.746583715711559, "grad_norm": 15.121378898620605, "learning_rate": 2.632219243484536e-05, "loss": 2.5015, "step": 2267500 }, { "epoch": 4.747630371437184, "grad_norm": 14.541014671325684, "learning_rate": 2.6316948178355066e-05, "loss": 2.509, "step": 2268000 }, { "epoch": 4.74867702716281, "grad_norm": 13.526613235473633, "learning_rate": 2.6311703921864777e-05, "loss": 2.4994, "step": 2268500 }, { "epoch": 4.749723682888435, "grad_norm": 15.327522277832031, "learning_rate": 2.6306459665374484e-05, "loss": 2.5116, "step": 2269000 }, { "epoch": 4.7507703386140605, "grad_norm": 14.25658893585205, "learning_rate": 2.630121540888419e-05, "loss": 2.5184, "step": 2269500 }, { "epoch": 4.751816994339686, "grad_norm": 13.979101181030273, "learning_rate": 2.62959711523939e-05, "loss": 2.4986, "step": 2270000 }, { "epoch": 4.752863650065311, "grad_norm": 12.121068954467773, "learning_rate": 2.6290726895903604e-05, "loss": 2.492, "step": 2270500 }, { "epoch": 4.753910305790937, "grad_norm": 15.654690742492676, "learning_rate": 2.628548263941332e-05, "loss": 2.5192, "step": 2271000 }, { "epoch": 4.754956961516562, "grad_norm": 14.607341766357422, "learning_rate": 2.6280238382923023e-05, "loss": 2.5118, "step": 2271500 }, { "epoch": 4.756003617242188, "grad_norm": 14.18625545501709, "learning_rate": 2.6274994126432734e-05, "loss": 2.5202, "step": 2272000 }, { "epoch": 4.757050272967813, "grad_norm": 16.02972412109375, "learning_rate": 2.6269749869942438e-05, "loss": 2.5055, "step": 2272500 }, { "epoch": 4.758096928693439, "grad_norm": 14.738447189331055, "learning_rate": 2.626450561345215e-05, "loss": 2.5148, "step": 2273000 }, { "epoch": 4.759143584419064, "grad_norm": 11.218700408935547, "learning_rate": 2.6259261356961857e-05, "loss": 2.508, "step": 2273500 }, { "epoch": 4.76019024014469, "grad_norm": 13.839921951293945, "learning_rate": 2.6254017100471568e-05, "loss": 2.5093, "step": 2274000 }, { "epoch": 4.761236895870315, "grad_norm": 16.211326599121094, "learning_rate": 2.6248772843981272e-05, "loss": 2.4926, "step": 2274500 }, { "epoch": 4.7622835515959405, "grad_norm": 13.438003540039062, "learning_rate": 2.624352858749098e-05, "loss": 2.5144, "step": 2275000 }, { "epoch": 4.763330207321566, "grad_norm": 16.730432510375977, "learning_rate": 2.623828433100069e-05, "loss": 2.5299, "step": 2275500 }, { "epoch": 4.764376863047191, "grad_norm": 16.54891586303711, "learning_rate": 2.6233040074510395e-05, "loss": 2.5263, "step": 2276000 }, { "epoch": 4.765423518772817, "grad_norm": 15.259262084960938, "learning_rate": 2.6227795818020107e-05, "loss": 2.5118, "step": 2276500 }, { "epoch": 4.766470174498442, "grad_norm": 12.407095909118652, "learning_rate": 2.622255156152981e-05, "loss": 2.5051, "step": 2277000 }, { "epoch": 4.767516830224068, "grad_norm": 15.792618751525879, "learning_rate": 2.6217307305039522e-05, "loss": 2.4982, "step": 2277500 }, { "epoch": 4.768563485949693, "grad_norm": 15.398799896240234, "learning_rate": 2.621206304854923e-05, "loss": 2.5277, "step": 2278000 }, { "epoch": 4.769610141675319, "grad_norm": 14.349798202514648, "learning_rate": 2.620681879205894e-05, "loss": 2.5124, "step": 2278500 }, { "epoch": 4.770656797400944, "grad_norm": 13.336326599121094, "learning_rate": 2.6201574535568645e-05, "loss": 2.5176, "step": 2279000 }, { "epoch": 4.77170345312657, "grad_norm": 16.308849334716797, "learning_rate": 2.6196330279078356e-05, "loss": 2.5191, "step": 2279500 }, { "epoch": 4.772750108852195, "grad_norm": 19.045997619628906, "learning_rate": 2.6191086022588064e-05, "loss": 2.4996, "step": 2280000 }, { "epoch": 4.7737967645778205, "grad_norm": 15.944437980651855, "learning_rate": 2.6185841766097768e-05, "loss": 2.4943, "step": 2280500 }, { "epoch": 4.774843420303446, "grad_norm": 14.494803428649902, "learning_rate": 2.618059750960748e-05, "loss": 2.486, "step": 2281000 }, { "epoch": 4.7758900760290715, "grad_norm": 17.567121505737305, "learning_rate": 2.6175353253117187e-05, "loss": 2.5213, "step": 2281500 }, { "epoch": 4.776936731754697, "grad_norm": 15.867685317993164, "learning_rate": 2.6170108996626898e-05, "loss": 2.4962, "step": 2282000 }, { "epoch": 4.777983387480323, "grad_norm": 12.769775390625, "learning_rate": 2.6164864740136602e-05, "loss": 2.5076, "step": 2282500 }, { "epoch": 4.779030043205949, "grad_norm": 16.421905517578125, "learning_rate": 2.6159620483646313e-05, "loss": 2.5166, "step": 2283000 }, { "epoch": 4.780076698931574, "grad_norm": 18.225173950195312, "learning_rate": 2.6154376227156018e-05, "loss": 2.5154, "step": 2283500 }, { "epoch": 4.7811233546572, "grad_norm": 16.898422241210938, "learning_rate": 2.614913197066573e-05, "loss": 2.5148, "step": 2284000 }, { "epoch": 4.782170010382825, "grad_norm": 15.964264869689941, "learning_rate": 2.6143887714175436e-05, "loss": 2.5228, "step": 2284500 }, { "epoch": 4.7832166661084505, "grad_norm": 15.748283386230469, "learning_rate": 2.6138643457685147e-05, "loss": 2.5175, "step": 2285000 }, { "epoch": 4.784263321834076, "grad_norm": 14.833762168884277, "learning_rate": 2.6133399201194852e-05, "loss": 2.5164, "step": 2285500 }, { "epoch": 4.7853099775597014, "grad_norm": 15.205721855163574, "learning_rate": 2.612815494470456e-05, "loss": 2.5175, "step": 2286000 }, { "epoch": 4.786356633285327, "grad_norm": 16.32640838623047, "learning_rate": 2.612291068821427e-05, "loss": 2.5063, "step": 2286500 }, { "epoch": 4.787403289010952, "grad_norm": 15.271356582641602, "learning_rate": 2.6117666431723975e-05, "loss": 2.5139, "step": 2287000 }, { "epoch": 4.788449944736578, "grad_norm": 14.001429557800293, "learning_rate": 2.6112422175233686e-05, "loss": 2.5212, "step": 2287500 }, { "epoch": 4.789496600462203, "grad_norm": 14.577919960021973, "learning_rate": 2.610717791874339e-05, "loss": 2.4955, "step": 2288000 }, { "epoch": 4.790543256187829, "grad_norm": 15.21825885772705, "learning_rate": 2.61019336622531e-05, "loss": 2.5266, "step": 2288500 }, { "epoch": 4.791589911913454, "grad_norm": 14.651813507080078, "learning_rate": 2.609668940576281e-05, "loss": 2.5056, "step": 2289000 }, { "epoch": 4.79263656763908, "grad_norm": 14.137418746948242, "learning_rate": 2.609144514927252e-05, "loss": 2.5197, "step": 2289500 }, { "epoch": 4.793683223364705, "grad_norm": 17.309476852416992, "learning_rate": 2.6086200892782224e-05, "loss": 2.5188, "step": 2290000 }, { "epoch": 4.7947298790903305, "grad_norm": 15.774557113647461, "learning_rate": 2.6080956636291935e-05, "loss": 2.494, "step": 2290500 }, { "epoch": 4.795776534815956, "grad_norm": 16.51141357421875, "learning_rate": 2.6075712379801643e-05, "loss": 2.5122, "step": 2291000 }, { "epoch": 4.7968231905415815, "grad_norm": 15.583073616027832, "learning_rate": 2.6070468123311347e-05, "loss": 2.5097, "step": 2291500 }, { "epoch": 4.797869846267207, "grad_norm": 15.49920654296875, "learning_rate": 2.606522386682106e-05, "loss": 2.5264, "step": 2292000 }, { "epoch": 4.798916501992832, "grad_norm": 15.781996726989746, "learning_rate": 2.6059979610330766e-05, "loss": 2.5149, "step": 2292500 }, { "epoch": 4.799963157718458, "grad_norm": 16.818788528442383, "learning_rate": 2.6054735353840477e-05, "loss": 2.5147, "step": 2293000 }, { "epoch": 4.801009813444083, "grad_norm": 15.806897163391113, "learning_rate": 2.604949109735018e-05, "loss": 2.5081, "step": 2293500 }, { "epoch": 4.802056469169709, "grad_norm": 14.540919303894043, "learning_rate": 2.6044246840859893e-05, "loss": 2.5188, "step": 2294000 }, { "epoch": 4.803103124895334, "grad_norm": 13.80440902709961, "learning_rate": 2.6039002584369597e-05, "loss": 2.5086, "step": 2294500 }, { "epoch": 4.80414978062096, "grad_norm": 15.419036865234375, "learning_rate": 2.6033758327879308e-05, "loss": 2.5292, "step": 2295000 }, { "epoch": 4.805196436346585, "grad_norm": 19.466079711914062, "learning_rate": 2.6028514071389016e-05, "loss": 2.517, "step": 2295500 }, { "epoch": 4.806243092072211, "grad_norm": 15.497919082641602, "learning_rate": 2.6023269814898727e-05, "loss": 2.5174, "step": 2296000 }, { "epoch": 4.807289747797836, "grad_norm": 13.538152694702148, "learning_rate": 2.601802555840843e-05, "loss": 2.4988, "step": 2296500 }, { "epoch": 4.8083364035234615, "grad_norm": 16.25927734375, "learning_rate": 2.6012781301918142e-05, "loss": 2.5149, "step": 2297000 }, { "epoch": 4.809383059249087, "grad_norm": 15.901090621948242, "learning_rate": 2.600753704542785e-05, "loss": 2.5241, "step": 2297500 }, { "epoch": 4.810429714974713, "grad_norm": 14.159479141235352, "learning_rate": 2.6002292788937554e-05, "loss": 2.5117, "step": 2298000 }, { "epoch": 4.811476370700339, "grad_norm": 13.516219139099121, "learning_rate": 2.5997048532447265e-05, "loss": 2.4825, "step": 2298500 }, { "epoch": 4.812523026425964, "grad_norm": 15.638578414916992, "learning_rate": 2.5991804275956973e-05, "loss": 2.5032, "step": 2299000 }, { "epoch": 4.81356968215159, "grad_norm": 15.464669227600098, "learning_rate": 2.5986560019466684e-05, "loss": 2.4856, "step": 2299500 }, { "epoch": 4.814616337877215, "grad_norm": 17.260665893554688, "learning_rate": 2.598131576297639e-05, "loss": 2.5213, "step": 2300000 }, { "epoch": 4.815662993602841, "grad_norm": 17.24683380126953, "learning_rate": 2.59760715064861e-05, "loss": 2.5193, "step": 2300500 }, { "epoch": 4.816709649328466, "grad_norm": 14.842212677001953, "learning_rate": 2.5970827249995804e-05, "loss": 2.4942, "step": 2301000 }, { "epoch": 4.8177563050540915, "grad_norm": 13.943187713623047, "learning_rate": 2.5965582993505515e-05, "loss": 2.5392, "step": 2301500 }, { "epoch": 4.818802960779717, "grad_norm": 19.444194793701172, "learning_rate": 2.5960338737015223e-05, "loss": 2.5184, "step": 2302000 }, { "epoch": 4.819849616505342, "grad_norm": 15.99068546295166, "learning_rate": 2.5955094480524934e-05, "loss": 2.5113, "step": 2302500 }, { "epoch": 4.820896272230968, "grad_norm": 12.631073951721191, "learning_rate": 2.5949850224034638e-05, "loss": 2.5327, "step": 2303000 }, { "epoch": 4.821942927956593, "grad_norm": 15.634881973266602, "learning_rate": 2.5944605967544346e-05, "loss": 2.5225, "step": 2303500 }, { "epoch": 4.822989583682219, "grad_norm": 18.42340660095215, "learning_rate": 2.5939361711054057e-05, "loss": 2.5224, "step": 2304000 }, { "epoch": 4.824036239407844, "grad_norm": 14.238322257995605, "learning_rate": 2.593411745456376e-05, "loss": 2.4926, "step": 2304500 }, { "epoch": 4.82508289513347, "grad_norm": 15.151586532592773, "learning_rate": 2.5928873198073472e-05, "loss": 2.5125, "step": 2305000 }, { "epoch": 4.826129550859095, "grad_norm": 13.374802589416504, "learning_rate": 2.5923628941583176e-05, "loss": 2.4974, "step": 2305500 }, { "epoch": 4.827176206584721, "grad_norm": 15.146775245666504, "learning_rate": 2.5918384685092887e-05, "loss": 2.5085, "step": 2306000 }, { "epoch": 4.828222862310346, "grad_norm": 18.82506561279297, "learning_rate": 2.5913140428602595e-05, "loss": 2.5028, "step": 2306500 }, { "epoch": 4.8292695180359715, "grad_norm": 14.900164604187012, "learning_rate": 2.5907896172112306e-05, "loss": 2.4967, "step": 2307000 }, { "epoch": 4.830316173761597, "grad_norm": 17.09398651123047, "learning_rate": 2.590265191562201e-05, "loss": 2.4823, "step": 2307500 }, { "epoch": 4.831362829487222, "grad_norm": 17.97745132446289, "learning_rate": 2.589740765913172e-05, "loss": 2.5014, "step": 2308000 }, { "epoch": 4.832409485212848, "grad_norm": 13.265668869018555, "learning_rate": 2.589216340264143e-05, "loss": 2.5294, "step": 2308500 }, { "epoch": 4.833456140938473, "grad_norm": 21.99448585510254, "learning_rate": 2.5886919146151134e-05, "loss": 2.5017, "step": 2309000 }, { "epoch": 4.834502796664099, "grad_norm": 16.089672088623047, "learning_rate": 2.5881674889660845e-05, "loss": 2.5224, "step": 2309500 }, { "epoch": 4.835549452389724, "grad_norm": 15.878034591674805, "learning_rate": 2.5876430633170552e-05, "loss": 2.5108, "step": 2310000 }, { "epoch": 4.83659610811535, "grad_norm": 21.413433074951172, "learning_rate": 2.5871186376680263e-05, "loss": 2.4997, "step": 2310500 }, { "epoch": 4.837642763840975, "grad_norm": 14.214812278747559, "learning_rate": 2.5865942120189968e-05, "loss": 2.5015, "step": 2311000 }, { "epoch": 4.838689419566601, "grad_norm": 18.11577796936035, "learning_rate": 2.586069786369968e-05, "loss": 2.5041, "step": 2311500 }, { "epoch": 4.839736075292226, "grad_norm": 13.969826698303223, "learning_rate": 2.5855453607209383e-05, "loss": 2.5005, "step": 2312000 }, { "epoch": 4.8407827310178515, "grad_norm": 18.804332733154297, "learning_rate": 2.5850209350719094e-05, "loss": 2.515, "step": 2312500 }, { "epoch": 4.841829386743477, "grad_norm": 17.273080825805664, "learning_rate": 2.5844965094228802e-05, "loss": 2.5165, "step": 2313000 }, { "epoch": 4.842876042469102, "grad_norm": 14.795208930969238, "learning_rate": 2.5839720837738513e-05, "loss": 2.513, "step": 2313500 }, { "epoch": 4.843922698194728, "grad_norm": 15.766861915588379, "learning_rate": 2.5834476581248217e-05, "loss": 2.5058, "step": 2314000 }, { "epoch": 4.844969353920353, "grad_norm": 14.514077186584473, "learning_rate": 2.5829232324757925e-05, "loss": 2.5078, "step": 2314500 }, { "epoch": 4.846016009645979, "grad_norm": 12.188573837280273, "learning_rate": 2.5823988068267636e-05, "loss": 2.508, "step": 2315000 }, { "epoch": 4.847062665371604, "grad_norm": 13.934535026550293, "learning_rate": 2.581874381177734e-05, "loss": 2.5144, "step": 2315500 }, { "epoch": 4.84810932109723, "grad_norm": 16.866952896118164, "learning_rate": 2.581349955528705e-05, "loss": 2.5404, "step": 2316000 }, { "epoch": 4.849155976822855, "grad_norm": 13.154962539672852, "learning_rate": 2.5808255298796756e-05, "loss": 2.4951, "step": 2316500 }, { "epoch": 4.850202632548481, "grad_norm": 15.220425605773926, "learning_rate": 2.580301104230647e-05, "loss": 2.5064, "step": 2317000 }, { "epoch": 4.851249288274107, "grad_norm": 15.935646057128906, "learning_rate": 2.5797766785816175e-05, "loss": 2.5144, "step": 2317500 }, { "epoch": 4.852295943999732, "grad_norm": 18.439979553222656, "learning_rate": 2.5792522529325886e-05, "loss": 2.534, "step": 2318000 }, { "epoch": 4.853342599725358, "grad_norm": 15.207350730895996, "learning_rate": 2.578727827283559e-05, "loss": 2.5165, "step": 2318500 }, { "epoch": 4.854389255450983, "grad_norm": 13.644597053527832, "learning_rate": 2.57820340163453e-05, "loss": 2.4942, "step": 2319000 }, { "epoch": 4.855435911176609, "grad_norm": 15.570724487304688, "learning_rate": 2.577678975985501e-05, "loss": 2.5187, "step": 2319500 }, { "epoch": 4.856482566902234, "grad_norm": 18.13333511352539, "learning_rate": 2.5771545503364713e-05, "loss": 2.5008, "step": 2320000 }, { "epoch": 4.85752922262786, "grad_norm": 14.488420486450195, "learning_rate": 2.5766301246874424e-05, "loss": 2.5127, "step": 2320500 }, { "epoch": 4.858575878353485, "grad_norm": 16.796283721923828, "learning_rate": 2.5761056990384132e-05, "loss": 2.4936, "step": 2321000 }, { "epoch": 4.859622534079111, "grad_norm": 15.095870971679688, "learning_rate": 2.5755812733893843e-05, "loss": 2.5047, "step": 2321500 }, { "epoch": 4.860669189804736, "grad_norm": 15.754949569702148, "learning_rate": 2.5750568477403547e-05, "loss": 2.5071, "step": 2322000 }, { "epoch": 4.8617158455303615, "grad_norm": 18.46614646911621, "learning_rate": 2.5745324220913258e-05, "loss": 2.5206, "step": 2322500 }, { "epoch": 4.862762501255987, "grad_norm": 15.499850273132324, "learning_rate": 2.5740079964422963e-05, "loss": 2.4927, "step": 2323000 }, { "epoch": 4.863809156981612, "grad_norm": 15.737031936645508, "learning_rate": 2.5734835707932674e-05, "loss": 2.5032, "step": 2323500 }, { "epoch": 4.864855812707238, "grad_norm": 15.454397201538086, "learning_rate": 2.572959145144238e-05, "loss": 2.5026, "step": 2324000 }, { "epoch": 4.865902468432863, "grad_norm": 14.05437183380127, "learning_rate": 2.5724347194952092e-05, "loss": 2.5054, "step": 2324500 }, { "epoch": 4.866949124158489, "grad_norm": 17.929943084716797, "learning_rate": 2.5719102938461797e-05, "loss": 2.5008, "step": 2325000 }, { "epoch": 4.867995779884114, "grad_norm": 15.105259895324707, "learning_rate": 2.5713858681971504e-05, "loss": 2.4977, "step": 2325500 }, { "epoch": 4.86904243560974, "grad_norm": 13.230875968933105, "learning_rate": 2.5708614425481215e-05, "loss": 2.5105, "step": 2326000 }, { "epoch": 4.870089091335365, "grad_norm": 12.833391189575195, "learning_rate": 2.570337016899092e-05, "loss": 2.5147, "step": 2326500 }, { "epoch": 4.871135747060991, "grad_norm": 16.93454933166504, "learning_rate": 2.569812591250063e-05, "loss": 2.501, "step": 2327000 }, { "epoch": 4.872182402786616, "grad_norm": 16.031373977661133, "learning_rate": 2.569288165601034e-05, "loss": 2.5154, "step": 2327500 }, { "epoch": 4.8732290585122415, "grad_norm": 13.061933517456055, "learning_rate": 2.568763739952005e-05, "loss": 2.4908, "step": 2328000 }, { "epoch": 4.874275714237867, "grad_norm": 16.18990135192871, "learning_rate": 2.5682393143029754e-05, "loss": 2.5025, "step": 2328500 }, { "epoch": 4.875322369963492, "grad_norm": 15.885838508605957, "learning_rate": 2.5677148886539465e-05, "loss": 2.4999, "step": 2329000 }, { "epoch": 4.876369025689118, "grad_norm": 16.837200164794922, "learning_rate": 2.567190463004917e-05, "loss": 2.4974, "step": 2329500 }, { "epoch": 4.877415681414743, "grad_norm": 16.08378028869629, "learning_rate": 2.566666037355888e-05, "loss": 2.5116, "step": 2330000 }, { "epoch": 4.878462337140369, "grad_norm": 14.85832405090332, "learning_rate": 2.5661416117068588e-05, "loss": 2.4991, "step": 2330500 }, { "epoch": 4.879508992865994, "grad_norm": 14.294221878051758, "learning_rate": 2.56561718605783e-05, "loss": 2.5172, "step": 2331000 }, { "epoch": 4.88055564859162, "grad_norm": 15.511452674865723, "learning_rate": 2.5650927604088003e-05, "loss": 2.4957, "step": 2331500 }, { "epoch": 4.881602304317245, "grad_norm": 15.71838092803955, "learning_rate": 2.564568334759771e-05, "loss": 2.5291, "step": 2332000 }, { "epoch": 4.882648960042871, "grad_norm": 15.35044002532959, "learning_rate": 2.5640439091107422e-05, "loss": 2.49, "step": 2332500 }, { "epoch": 4.883695615768497, "grad_norm": 16.67942237854004, "learning_rate": 2.5635194834617127e-05, "loss": 2.515, "step": 2333000 }, { "epoch": 4.884742271494122, "grad_norm": 14.404597282409668, "learning_rate": 2.5629950578126838e-05, "loss": 2.5032, "step": 2333500 }, { "epoch": 4.885788927219748, "grad_norm": 17.211360931396484, "learning_rate": 2.5624706321636542e-05, "loss": 2.5093, "step": 2334000 }, { "epoch": 4.886835582945373, "grad_norm": 15.346117973327637, "learning_rate": 2.5619462065146256e-05, "loss": 2.5178, "step": 2334500 }, { "epoch": 4.887882238670999, "grad_norm": 15.769700050354004, "learning_rate": 2.561421780865596e-05, "loss": 2.5016, "step": 2335000 }, { "epoch": 4.888928894396624, "grad_norm": 13.797697067260742, "learning_rate": 2.5608973552165672e-05, "loss": 2.5173, "step": 2335500 }, { "epoch": 4.88997555012225, "grad_norm": 16.066465377807617, "learning_rate": 2.5603729295675376e-05, "loss": 2.5055, "step": 2336000 }, { "epoch": 4.891022205847875, "grad_norm": 16.313871383666992, "learning_rate": 2.5598485039185087e-05, "loss": 2.4969, "step": 2336500 }, { "epoch": 4.892068861573501, "grad_norm": 15.675226211547852, "learning_rate": 2.5593240782694795e-05, "loss": 2.5054, "step": 2337000 }, { "epoch": 4.893115517299126, "grad_norm": 15.237692832946777, "learning_rate": 2.55879965262045e-05, "loss": 2.5005, "step": 2337500 }, { "epoch": 4.8941621730247515, "grad_norm": 16.706188201904297, "learning_rate": 2.558275226971421e-05, "loss": 2.5242, "step": 2338000 }, { "epoch": 4.895208828750377, "grad_norm": 13.544087409973145, "learning_rate": 2.5577508013223918e-05, "loss": 2.5038, "step": 2338500 }, { "epoch": 4.8962554844760025, "grad_norm": 13.597088813781738, "learning_rate": 2.557226375673363e-05, "loss": 2.4788, "step": 2339000 }, { "epoch": 4.897302140201628, "grad_norm": 19.618803024291992, "learning_rate": 2.5567019500243333e-05, "loss": 2.497, "step": 2339500 }, { "epoch": 4.898348795927253, "grad_norm": 14.893771171569824, "learning_rate": 2.5561775243753044e-05, "loss": 2.4954, "step": 2340000 }, { "epoch": 4.899395451652879, "grad_norm": 14.016271591186523, "learning_rate": 2.555653098726275e-05, "loss": 2.5081, "step": 2340500 }, { "epoch": 4.900442107378504, "grad_norm": 14.301826477050781, "learning_rate": 2.555128673077246e-05, "loss": 2.5041, "step": 2341000 }, { "epoch": 4.90148876310413, "grad_norm": 14.694372177124023, "learning_rate": 2.5546042474282167e-05, "loss": 2.5088, "step": 2341500 }, { "epoch": 4.902535418829755, "grad_norm": 15.628500938415527, "learning_rate": 2.554079821779188e-05, "loss": 2.4789, "step": 2342000 }, { "epoch": 4.903582074555381, "grad_norm": 13.712932586669922, "learning_rate": 2.5535553961301583e-05, "loss": 2.5022, "step": 2342500 }, { "epoch": 4.904628730281006, "grad_norm": 16.208005905151367, "learning_rate": 2.553030970481129e-05, "loss": 2.5268, "step": 2343000 }, { "epoch": 4.9056753860066316, "grad_norm": 12.918237686157227, "learning_rate": 2.5525065448321e-05, "loss": 2.514, "step": 2343500 }, { "epoch": 4.906722041732257, "grad_norm": 16.16494369506836, "learning_rate": 2.5519821191830706e-05, "loss": 2.5028, "step": 2344000 }, { "epoch": 4.9077686974578825, "grad_norm": 12.405097961425781, "learning_rate": 2.5514576935340417e-05, "loss": 2.5124, "step": 2344500 }, { "epoch": 4.908815353183508, "grad_norm": 14.580911636352539, "learning_rate": 2.5509332678850125e-05, "loss": 2.512, "step": 2345000 }, { "epoch": 4.909862008909133, "grad_norm": 17.34947967529297, "learning_rate": 2.5504088422359836e-05, "loss": 2.5171, "step": 2345500 }, { "epoch": 4.910908664634759, "grad_norm": 21.08298110961914, "learning_rate": 2.549884416586954e-05, "loss": 2.506, "step": 2346000 }, { "epoch": 4.911955320360384, "grad_norm": 15.565848350524902, "learning_rate": 2.549359990937925e-05, "loss": 2.4977, "step": 2346500 }, { "epoch": 4.91300197608601, "grad_norm": 14.575974464416504, "learning_rate": 2.5488355652888955e-05, "loss": 2.5084, "step": 2347000 }, { "epoch": 4.914048631811635, "grad_norm": 12.379159927368164, "learning_rate": 2.5483111396398666e-05, "loss": 2.493, "step": 2347500 }, { "epoch": 4.915095287537261, "grad_norm": 15.616188049316406, "learning_rate": 2.5477867139908374e-05, "loss": 2.5076, "step": 2348000 }, { "epoch": 4.916141943262886, "grad_norm": 15.736719131469727, "learning_rate": 2.547262288341808e-05, "loss": 2.4888, "step": 2348500 }, { "epoch": 4.917188598988512, "grad_norm": 14.74796199798584, "learning_rate": 2.546737862692779e-05, "loss": 2.4992, "step": 2349000 }, { "epoch": 4.918235254714137, "grad_norm": 17.856752395629883, "learning_rate": 2.5462134370437497e-05, "loss": 2.5045, "step": 2349500 }, { "epoch": 4.9192819104397625, "grad_norm": 13.819202423095703, "learning_rate": 2.545689011394721e-05, "loss": 2.5136, "step": 2350000 }, { "epoch": 4.920328566165388, "grad_norm": 14.163532257080078, "learning_rate": 2.5451645857456913e-05, "loss": 2.5047, "step": 2350500 }, { "epoch": 4.921375221891013, "grad_norm": 16.23557472229004, "learning_rate": 2.5446401600966624e-05, "loss": 2.5134, "step": 2351000 }, { "epoch": 4.922421877616639, "grad_norm": 14.925554275512695, "learning_rate": 2.5441157344476328e-05, "loss": 2.5055, "step": 2351500 }, { "epoch": 4.923468533342264, "grad_norm": 15.190961837768555, "learning_rate": 2.543591308798604e-05, "loss": 2.5206, "step": 2352000 }, { "epoch": 4.924515189067891, "grad_norm": 18.179338455200195, "learning_rate": 2.5430668831495747e-05, "loss": 2.5104, "step": 2352500 }, { "epoch": 4.925561844793516, "grad_norm": 15.305755615234375, "learning_rate": 2.5425424575005458e-05, "loss": 2.507, "step": 2353000 }, { "epoch": 4.926608500519142, "grad_norm": 16.967416763305664, "learning_rate": 2.5420180318515162e-05, "loss": 2.5234, "step": 2353500 }, { "epoch": 4.927655156244767, "grad_norm": 14.766816139221191, "learning_rate": 2.541493606202487e-05, "loss": 2.4826, "step": 2354000 }, { "epoch": 4.9287018119703925, "grad_norm": 16.72176742553711, "learning_rate": 2.540969180553458e-05, "loss": 2.5037, "step": 2354500 }, { "epoch": 4.929748467696018, "grad_norm": 14.017334938049316, "learning_rate": 2.5404447549044285e-05, "loss": 2.5209, "step": 2355000 }, { "epoch": 4.930795123421643, "grad_norm": 15.102208137512207, "learning_rate": 2.5399203292553996e-05, "loss": 2.4942, "step": 2355500 }, { "epoch": 4.931841779147269, "grad_norm": 16.54803466796875, "learning_rate": 2.5393959036063704e-05, "loss": 2.4995, "step": 2356000 }, { "epoch": 4.932888434872894, "grad_norm": 18.33749771118164, "learning_rate": 2.5388714779573415e-05, "loss": 2.5009, "step": 2356500 }, { "epoch": 4.93393509059852, "grad_norm": 15.855558395385742, "learning_rate": 2.538347052308312e-05, "loss": 2.5125, "step": 2357000 }, { "epoch": 4.934981746324145, "grad_norm": 12.849908828735352, "learning_rate": 2.537822626659283e-05, "loss": 2.4765, "step": 2357500 }, { "epoch": 4.936028402049771, "grad_norm": 12.67774772644043, "learning_rate": 2.5372982010102535e-05, "loss": 2.5165, "step": 2358000 }, { "epoch": 4.937075057775396, "grad_norm": 15.584507942199707, "learning_rate": 2.5367737753612246e-05, "loss": 2.4924, "step": 2358500 }, { "epoch": 4.938121713501022, "grad_norm": 13.444579124450684, "learning_rate": 2.5362493497121954e-05, "loss": 2.5234, "step": 2359000 }, { "epoch": 4.939168369226647, "grad_norm": 16.250566482543945, "learning_rate": 2.5357249240631665e-05, "loss": 2.5076, "step": 2359500 }, { "epoch": 4.9402150249522725, "grad_norm": 15.106025695800781, "learning_rate": 2.535200498414137e-05, "loss": 2.5043, "step": 2360000 }, { "epoch": 4.941261680677898, "grad_norm": 15.239134788513184, "learning_rate": 2.5346760727651077e-05, "loss": 2.5058, "step": 2360500 }, { "epoch": 4.942308336403523, "grad_norm": 17.358144760131836, "learning_rate": 2.5341516471160788e-05, "loss": 2.5002, "step": 2361000 }, { "epoch": 4.943354992129149, "grad_norm": 13.634446144104004, "learning_rate": 2.5336272214670492e-05, "loss": 2.5112, "step": 2361500 }, { "epoch": 4.944401647854774, "grad_norm": 17.882991790771484, "learning_rate": 2.5331027958180203e-05, "loss": 2.5034, "step": 2362000 }, { "epoch": 4.9454483035804, "grad_norm": 21.545536041259766, "learning_rate": 2.532578370168991e-05, "loss": 2.5043, "step": 2362500 }, { "epoch": 4.946494959306025, "grad_norm": 12.567097663879395, "learning_rate": 2.5320539445199622e-05, "loss": 2.5276, "step": 2363000 }, { "epoch": 4.947541615031651, "grad_norm": 14.340062141418457, "learning_rate": 2.5315295188709326e-05, "loss": 2.5132, "step": 2363500 }, { "epoch": 4.948588270757276, "grad_norm": 17.0262508392334, "learning_rate": 2.5310050932219037e-05, "loss": 2.4959, "step": 2364000 }, { "epoch": 4.949634926482902, "grad_norm": 14.500162124633789, "learning_rate": 2.530480667572874e-05, "loss": 2.5172, "step": 2364500 }, { "epoch": 4.950681582208527, "grad_norm": 14.082984924316406, "learning_rate": 2.5299562419238453e-05, "loss": 2.4888, "step": 2365000 }, { "epoch": 4.9517282379341525, "grad_norm": 16.73431396484375, "learning_rate": 2.529431816274816e-05, "loss": 2.5131, "step": 2365500 }, { "epoch": 4.952774893659778, "grad_norm": 14.731694221496582, "learning_rate": 2.5289073906257865e-05, "loss": 2.4896, "step": 2366000 }, { "epoch": 4.953821549385403, "grad_norm": 17.68813133239746, "learning_rate": 2.5283829649767576e-05, "loss": 2.5072, "step": 2366500 }, { "epoch": 4.954868205111029, "grad_norm": 18.308490753173828, "learning_rate": 2.5278585393277283e-05, "loss": 2.5091, "step": 2367000 }, { "epoch": 4.955914860836654, "grad_norm": 16.838808059692383, "learning_rate": 2.5273341136786994e-05, "loss": 2.5133, "step": 2367500 }, { "epoch": 4.956961516562281, "grad_norm": 13.417041778564453, "learning_rate": 2.52680968802967e-05, "loss": 2.5075, "step": 2368000 }, { "epoch": 4.958008172287906, "grad_norm": 16.532649993896484, "learning_rate": 2.526285262380641e-05, "loss": 2.501, "step": 2368500 }, { "epoch": 4.959054828013532, "grad_norm": 19.53662109375, "learning_rate": 2.5257608367316114e-05, "loss": 2.4944, "step": 2369000 }, { "epoch": 4.960101483739157, "grad_norm": 14.533868789672852, "learning_rate": 2.5252364110825825e-05, "loss": 2.5092, "step": 2369500 }, { "epoch": 4.9611481394647825, "grad_norm": 15.753438949584961, "learning_rate": 2.5247119854335533e-05, "loss": 2.5218, "step": 2370000 }, { "epoch": 4.962194795190408, "grad_norm": 16.647422790527344, "learning_rate": 2.5241875597845244e-05, "loss": 2.532, "step": 2370500 }, { "epoch": 4.963241450916033, "grad_norm": 15.372187614440918, "learning_rate": 2.5236631341354948e-05, "loss": 2.4994, "step": 2371000 }, { "epoch": 4.964288106641659, "grad_norm": 14.639010429382324, "learning_rate": 2.5231387084864656e-05, "loss": 2.5095, "step": 2371500 }, { "epoch": 4.965334762367284, "grad_norm": 17.158214569091797, "learning_rate": 2.5226142828374367e-05, "loss": 2.5226, "step": 2372000 }, { "epoch": 4.96638141809291, "grad_norm": 15.388568878173828, "learning_rate": 2.522089857188407e-05, "loss": 2.5168, "step": 2372500 }, { "epoch": 4.967428073818535, "grad_norm": 14.092421531677246, "learning_rate": 2.5215654315393782e-05, "loss": 2.4977, "step": 2373000 }, { "epoch": 4.968474729544161, "grad_norm": 15.0305814743042, "learning_rate": 2.521041005890349e-05, "loss": 2.5098, "step": 2373500 }, { "epoch": 4.969521385269786, "grad_norm": 17.838096618652344, "learning_rate": 2.52051658024132e-05, "loss": 2.522, "step": 2374000 }, { "epoch": 4.970568040995412, "grad_norm": 14.350260734558105, "learning_rate": 2.5199921545922906e-05, "loss": 2.4911, "step": 2374500 }, { "epoch": 4.971614696721037, "grad_norm": 17.21567726135254, "learning_rate": 2.5194677289432617e-05, "loss": 2.5193, "step": 2375000 }, { "epoch": 4.9726613524466625, "grad_norm": 18.40346336364746, "learning_rate": 2.518943303294232e-05, "loss": 2.4995, "step": 2375500 }, { "epoch": 4.973708008172288, "grad_norm": 12.809713363647461, "learning_rate": 2.5184188776452032e-05, "loss": 2.5198, "step": 2376000 }, { "epoch": 4.974754663897913, "grad_norm": 15.601197242736816, "learning_rate": 2.517894451996174e-05, "loss": 2.5192, "step": 2376500 }, { "epoch": 4.975801319623539, "grad_norm": 16.747882843017578, "learning_rate": 2.5173700263471444e-05, "loss": 2.5028, "step": 2377000 }, { "epoch": 4.976847975349164, "grad_norm": 14.945947647094727, "learning_rate": 2.5168456006981155e-05, "loss": 2.4876, "step": 2377500 }, { "epoch": 4.97789463107479, "grad_norm": 14.54643440246582, "learning_rate": 2.5163211750490863e-05, "loss": 2.4887, "step": 2378000 }, { "epoch": 4.978941286800415, "grad_norm": 15.20840835571289, "learning_rate": 2.5157967494000574e-05, "loss": 2.5056, "step": 2378500 }, { "epoch": 4.979987942526041, "grad_norm": 14.193199157714844, "learning_rate": 2.5152723237510278e-05, "loss": 2.512, "step": 2379000 }, { "epoch": 4.981034598251666, "grad_norm": 16.899934768676758, "learning_rate": 2.514747898101999e-05, "loss": 2.5021, "step": 2379500 }, { "epoch": 4.982081253977292, "grad_norm": 16.156856536865234, "learning_rate": 2.5142234724529694e-05, "loss": 2.5032, "step": 2380000 }, { "epoch": 4.983127909702917, "grad_norm": 15.439732551574707, "learning_rate": 2.5136990468039408e-05, "loss": 2.5087, "step": 2380500 }, { "epoch": 4.9841745654285425, "grad_norm": 16.297109603881836, "learning_rate": 2.5131746211549112e-05, "loss": 2.5068, "step": 2381000 }, { "epoch": 4.985221221154168, "grad_norm": 16.001394271850586, "learning_rate": 2.5126501955058823e-05, "loss": 2.4962, "step": 2381500 }, { "epoch": 4.9862678768797934, "grad_norm": 18.69229507446289, "learning_rate": 2.5121257698568528e-05, "loss": 2.4963, "step": 2382000 }, { "epoch": 4.987314532605419, "grad_norm": 23.14253807067871, "learning_rate": 2.5116013442078235e-05, "loss": 2.495, "step": 2382500 }, { "epoch": 4.988361188331044, "grad_norm": 12.459425926208496, "learning_rate": 2.5110769185587946e-05, "loss": 2.5111, "step": 2383000 }, { "epoch": 4.98940784405667, "grad_norm": 13.16530990600586, "learning_rate": 2.510552492909765e-05, "loss": 2.5086, "step": 2383500 }, { "epoch": 4.990454499782295, "grad_norm": 17.70482635498047, "learning_rate": 2.5100280672607362e-05, "loss": 2.4925, "step": 2384000 }, { "epoch": 4.991501155507921, "grad_norm": 16.925888061523438, "learning_rate": 2.509503641611707e-05, "loss": 2.51, "step": 2384500 }, { "epoch": 4.992547811233546, "grad_norm": 13.683113098144531, "learning_rate": 2.508979215962678e-05, "loss": 2.4713, "step": 2385000 }, { "epoch": 4.993594466959172, "grad_norm": 14.90453815460205, "learning_rate": 2.5084547903136485e-05, "loss": 2.4866, "step": 2385500 }, { "epoch": 4.994641122684797, "grad_norm": 14.274913787841797, "learning_rate": 2.5079303646646196e-05, "loss": 2.5158, "step": 2386000 }, { "epoch": 4.9956877784104226, "grad_norm": 14.000239372253418, "learning_rate": 2.50740593901559e-05, "loss": 2.5006, "step": 2386500 }, { "epoch": 4.996734434136048, "grad_norm": 16.31768226623535, "learning_rate": 2.506881513366561e-05, "loss": 2.499, "step": 2387000 }, { "epoch": 4.997781089861674, "grad_norm": 13.601651191711426, "learning_rate": 2.506357087717532e-05, "loss": 2.5071, "step": 2387500 }, { "epoch": 4.9988277455873, "grad_norm": 14.876225471496582, "learning_rate": 2.5058326620685023e-05, "loss": 2.5159, "step": 2388000 }, { "epoch": 4.999874401312925, "grad_norm": 13.546016693115234, "learning_rate": 2.5053082364194734e-05, "loss": 2.4942, "step": 2388500 }, { "epoch": 5.000921057038551, "grad_norm": 16.981218338012695, "learning_rate": 2.5047838107704442e-05, "loss": 2.4972, "step": 2389000 }, { "epoch": 5.001967712764176, "grad_norm": 15.465879440307617, "learning_rate": 2.5042593851214153e-05, "loss": 2.496, "step": 2389500 }, { "epoch": 5.003014368489802, "grad_norm": 15.675577163696289, "learning_rate": 2.5037349594723858e-05, "loss": 2.4917, "step": 2390000 }, { "epoch": 5.004061024215427, "grad_norm": 13.556571960449219, "learning_rate": 2.503210533823357e-05, "loss": 2.492, "step": 2390500 }, { "epoch": 5.0051076799410525, "grad_norm": 13.35189437866211, "learning_rate": 2.5026861081743276e-05, "loss": 2.4858, "step": 2391000 }, { "epoch": 5.006154335666678, "grad_norm": 21.81891441345215, "learning_rate": 2.5021616825252987e-05, "loss": 2.4903, "step": 2391500 }, { "epoch": 5.0072009913923035, "grad_norm": 17.961732864379883, "learning_rate": 2.501637256876269e-05, "loss": 2.5072, "step": 2392000 }, { "epoch": 5.008247647117929, "grad_norm": 17.048500061035156, "learning_rate": 2.5011128312272403e-05, "loss": 2.4935, "step": 2392500 }, { "epoch": 5.009294302843554, "grad_norm": 15.966647148132324, "learning_rate": 2.5005884055782107e-05, "loss": 2.5086, "step": 2393000 }, { "epoch": 5.01034095856918, "grad_norm": 16.390535354614258, "learning_rate": 2.5000639799291818e-05, "loss": 2.4988, "step": 2393500 }, { "epoch": 5.011387614294805, "grad_norm": 14.977058410644531, "learning_rate": 2.4995395542801526e-05, "loss": 2.4922, "step": 2394000 }, { "epoch": 5.012434270020431, "grad_norm": 15.36262035369873, "learning_rate": 2.4990151286311234e-05, "loss": 2.4789, "step": 2394500 }, { "epoch": 5.013480925746056, "grad_norm": 15.877311706542969, "learning_rate": 2.498490702982094e-05, "loss": 2.4918, "step": 2395000 }, { "epoch": 5.014527581471682, "grad_norm": 16.14385223388672, "learning_rate": 2.497966277333065e-05, "loss": 2.4982, "step": 2395500 }, { "epoch": 5.015574237197307, "grad_norm": 14.418159484863281, "learning_rate": 2.497441851684036e-05, "loss": 2.4997, "step": 2396000 }, { "epoch": 5.016620892922933, "grad_norm": 14.839599609375, "learning_rate": 2.4969174260350064e-05, "loss": 2.4966, "step": 2396500 }, { "epoch": 5.017667548648558, "grad_norm": 16.715492248535156, "learning_rate": 2.4963930003859772e-05, "loss": 2.5008, "step": 2397000 }, { "epoch": 5.0187142043741835, "grad_norm": 18.614055633544922, "learning_rate": 2.495868574736948e-05, "loss": 2.5012, "step": 2397500 }, { "epoch": 5.019760860099809, "grad_norm": 14.893436431884766, "learning_rate": 2.495344149087919e-05, "loss": 2.4892, "step": 2398000 }, { "epoch": 5.020807515825434, "grad_norm": 14.951645851135254, "learning_rate": 2.49481972343889e-05, "loss": 2.488, "step": 2398500 }, { "epoch": 5.02185417155106, "grad_norm": 18.121728897094727, "learning_rate": 2.4942952977898606e-05, "loss": 2.5066, "step": 2399000 }, { "epoch": 5.022900827276685, "grad_norm": 18.317779541015625, "learning_rate": 2.4937708721408314e-05, "loss": 2.4948, "step": 2399500 }, { "epoch": 5.023947483002311, "grad_norm": 13.099725723266602, "learning_rate": 2.4932464464918025e-05, "loss": 2.4877, "step": 2400000 }, { "epoch": 5.024994138727936, "grad_norm": 17.715681076049805, "learning_rate": 2.4927220208427733e-05, "loss": 2.5085, "step": 2400500 }, { "epoch": 5.026040794453562, "grad_norm": 12.471216201782227, "learning_rate": 2.492197595193744e-05, "loss": 2.4793, "step": 2401000 }, { "epoch": 5.027087450179187, "grad_norm": 15.447701454162598, "learning_rate": 2.4916731695447148e-05, "loss": 2.5036, "step": 2401500 }, { "epoch": 5.028134105904813, "grad_norm": 13.136443138122559, "learning_rate": 2.4911487438956856e-05, "loss": 2.4951, "step": 2402000 }, { "epoch": 5.029180761630438, "grad_norm": 15.2191801071167, "learning_rate": 2.4906243182466563e-05, "loss": 2.4974, "step": 2402500 }, { "epoch": 5.0302274173560635, "grad_norm": 15.39077091217041, "learning_rate": 2.490099892597627e-05, "loss": 2.4865, "step": 2403000 }, { "epoch": 5.031274073081689, "grad_norm": 17.003013610839844, "learning_rate": 2.489575466948598e-05, "loss": 2.4969, "step": 2403500 }, { "epoch": 5.032320728807315, "grad_norm": 14.205581665039062, "learning_rate": 2.4890510412995686e-05, "loss": 2.4826, "step": 2404000 }, { "epoch": 5.033367384532941, "grad_norm": 16.212162017822266, "learning_rate": 2.4885266156505397e-05, "loss": 2.4852, "step": 2404500 }, { "epoch": 5.034414040258566, "grad_norm": 15.994488716125488, "learning_rate": 2.4880021900015105e-05, "loss": 2.5102, "step": 2405000 }, { "epoch": 5.035460695984192, "grad_norm": 13.53494644165039, "learning_rate": 2.4874777643524813e-05, "loss": 2.4821, "step": 2405500 }, { "epoch": 5.036507351709817, "grad_norm": 15.660935401916504, "learning_rate": 2.486953338703452e-05, "loss": 2.491, "step": 2406000 }, { "epoch": 5.037554007435443, "grad_norm": 14.451323509216309, "learning_rate": 2.4864289130544228e-05, "loss": 2.5116, "step": 2406500 }, { "epoch": 5.038600663161068, "grad_norm": 15.883487701416016, "learning_rate": 2.485904487405394e-05, "loss": 2.4864, "step": 2407000 }, { "epoch": 5.0396473188866935, "grad_norm": 15.36342716217041, "learning_rate": 2.4853800617563647e-05, "loss": 2.5015, "step": 2407500 }, { "epoch": 5.040693974612319, "grad_norm": 13.194411277770996, "learning_rate": 2.484855636107335e-05, "loss": 2.4862, "step": 2408000 }, { "epoch": 5.041740630337944, "grad_norm": 16.976150512695312, "learning_rate": 2.4843312104583062e-05, "loss": 2.4906, "step": 2408500 }, { "epoch": 5.04278728606357, "grad_norm": 14.936477661132812, "learning_rate": 2.483806784809277e-05, "loss": 2.4894, "step": 2409000 }, { "epoch": 5.043833941789195, "grad_norm": 16.701427459716797, "learning_rate": 2.4832823591602478e-05, "loss": 2.5168, "step": 2409500 }, { "epoch": 5.044880597514821, "grad_norm": 15.189602851867676, "learning_rate": 2.4827579335112185e-05, "loss": 2.479, "step": 2410000 }, { "epoch": 5.045927253240446, "grad_norm": 17.02239418029785, "learning_rate": 2.4822335078621893e-05, "loss": 2.4942, "step": 2410500 }, { "epoch": 5.046973908966072, "grad_norm": 14.301746368408203, "learning_rate": 2.4817090822131604e-05, "loss": 2.5108, "step": 2411000 }, { "epoch": 5.048020564691697, "grad_norm": 17.79118537902832, "learning_rate": 2.4811846565641312e-05, "loss": 2.4939, "step": 2411500 }, { "epoch": 5.049067220417323, "grad_norm": 13.926859855651855, "learning_rate": 2.480660230915102e-05, "loss": 2.4945, "step": 2412000 }, { "epoch": 5.050113876142948, "grad_norm": 15.321176528930664, "learning_rate": 2.4801358052660727e-05, "loss": 2.5009, "step": 2412500 }, { "epoch": 5.0511605318685735, "grad_norm": 19.06781578063965, "learning_rate": 2.4796113796170435e-05, "loss": 2.4923, "step": 2413000 }, { "epoch": 5.052207187594199, "grad_norm": 17.3582763671875, "learning_rate": 2.4790869539680143e-05, "loss": 2.4877, "step": 2413500 }, { "epoch": 5.053253843319824, "grad_norm": 15.361605644226074, "learning_rate": 2.478562528318985e-05, "loss": 2.4934, "step": 2414000 }, { "epoch": 5.05430049904545, "grad_norm": 16.004718780517578, "learning_rate": 2.4780381026699558e-05, "loss": 2.484, "step": 2414500 }, { "epoch": 5.055347154771075, "grad_norm": 15.82640552520752, "learning_rate": 2.4775136770209266e-05, "loss": 2.5223, "step": 2415000 }, { "epoch": 5.056393810496701, "grad_norm": 15.176981925964355, "learning_rate": 2.4769892513718977e-05, "loss": 2.4877, "step": 2415500 }, { "epoch": 5.057440466222326, "grad_norm": 15.167255401611328, "learning_rate": 2.4764648257228685e-05, "loss": 2.4959, "step": 2416000 }, { "epoch": 5.058487121947952, "grad_norm": 15.169243812561035, "learning_rate": 2.4759404000738392e-05, "loss": 2.4983, "step": 2416500 }, { "epoch": 5.059533777673577, "grad_norm": 16.31517791748047, "learning_rate": 2.47541597442481e-05, "loss": 2.4768, "step": 2417000 }, { "epoch": 5.060580433399203, "grad_norm": 13.952658653259277, "learning_rate": 2.474891548775781e-05, "loss": 2.5177, "step": 2417500 }, { "epoch": 5.061627089124828, "grad_norm": 14.39082145690918, "learning_rate": 2.474367123126752e-05, "loss": 2.5001, "step": 2418000 }, { "epoch": 5.0626737448504535, "grad_norm": 13.167707443237305, "learning_rate": 2.4738426974777226e-05, "loss": 2.4802, "step": 2418500 }, { "epoch": 5.063720400576079, "grad_norm": 15.049192428588867, "learning_rate": 2.4733182718286934e-05, "loss": 2.4823, "step": 2419000 }, { "epoch": 5.064767056301704, "grad_norm": 16.556591033935547, "learning_rate": 2.4727938461796642e-05, "loss": 2.4929, "step": 2419500 }, { "epoch": 5.06581371202733, "grad_norm": 15.610702514648438, "learning_rate": 2.472269420530635e-05, "loss": 2.4877, "step": 2420000 }, { "epoch": 5.066860367752955, "grad_norm": 14.017250061035156, "learning_rate": 2.4717449948816057e-05, "loss": 2.5075, "step": 2420500 }, { "epoch": 5.067907023478581, "grad_norm": 13.670541763305664, "learning_rate": 2.4712205692325765e-05, "loss": 2.4932, "step": 2421000 }, { "epoch": 5.068953679204207, "grad_norm": 16.480525970458984, "learning_rate": 2.4706961435835473e-05, "loss": 2.4761, "step": 2421500 }, { "epoch": 5.070000334929833, "grad_norm": 15.679006576538086, "learning_rate": 2.4701717179345184e-05, "loss": 2.5066, "step": 2422000 }, { "epoch": 5.071046990655458, "grad_norm": 14.6192626953125, "learning_rate": 2.469647292285489e-05, "loss": 2.4969, "step": 2422500 }, { "epoch": 5.0720936463810835, "grad_norm": 13.077757835388184, "learning_rate": 2.46912286663646e-05, "loss": 2.481, "step": 2423000 }, { "epoch": 5.073140302106709, "grad_norm": 13.648320198059082, "learning_rate": 2.4685984409874307e-05, "loss": 2.4782, "step": 2423500 }, { "epoch": 5.074186957832334, "grad_norm": 16.502445220947266, "learning_rate": 2.4680740153384014e-05, "loss": 2.4758, "step": 2424000 }, { "epoch": 5.07523361355796, "grad_norm": 15.913890838623047, "learning_rate": 2.4675495896893725e-05, "loss": 2.5079, "step": 2424500 }, { "epoch": 5.076280269283585, "grad_norm": 18.35245704650879, "learning_rate": 2.467025164040343e-05, "loss": 2.4942, "step": 2425000 }, { "epoch": 5.077326925009211, "grad_norm": 13.276002883911133, "learning_rate": 2.4665007383913137e-05, "loss": 2.5157, "step": 2425500 }, { "epoch": 5.078373580734836, "grad_norm": 13.606440544128418, "learning_rate": 2.465976312742285e-05, "loss": 2.4931, "step": 2426000 }, { "epoch": 5.079420236460462, "grad_norm": 23.86888313293457, "learning_rate": 2.4654518870932556e-05, "loss": 2.4854, "step": 2426500 }, { "epoch": 5.080466892186087, "grad_norm": 16.214014053344727, "learning_rate": 2.4649274614442264e-05, "loss": 2.489, "step": 2427000 }, { "epoch": 5.081513547911713, "grad_norm": 17.554887771606445, "learning_rate": 2.464403035795197e-05, "loss": 2.4834, "step": 2427500 }, { "epoch": 5.082560203637338, "grad_norm": 14.94356918334961, "learning_rate": 2.463878610146168e-05, "loss": 2.4821, "step": 2428000 }, { "epoch": 5.0836068593629635, "grad_norm": 19.421241760253906, "learning_rate": 2.463354184497139e-05, "loss": 2.4966, "step": 2428500 }, { "epoch": 5.084653515088589, "grad_norm": 18.493684768676758, "learning_rate": 2.4628297588481098e-05, "loss": 2.4984, "step": 2429000 }, { "epoch": 5.085700170814214, "grad_norm": 17.541091918945312, "learning_rate": 2.4623053331990806e-05, "loss": 2.4968, "step": 2429500 }, { "epoch": 5.08674682653984, "grad_norm": 13.90014934539795, "learning_rate": 2.4617809075500513e-05, "loss": 2.5067, "step": 2430000 }, { "epoch": 5.087793482265465, "grad_norm": 13.382936477661133, "learning_rate": 2.461256481901022e-05, "loss": 2.5131, "step": 2430500 }, { "epoch": 5.088840137991091, "grad_norm": 17.824268341064453, "learning_rate": 2.460732056251993e-05, "loss": 2.5141, "step": 2431000 }, { "epoch": 5.089886793716716, "grad_norm": 22.197551727294922, "learning_rate": 2.4602076306029637e-05, "loss": 2.4932, "step": 2431500 }, { "epoch": 5.090933449442342, "grad_norm": 16.433500289916992, "learning_rate": 2.4596832049539344e-05, "loss": 2.4971, "step": 2432000 }, { "epoch": 5.091980105167967, "grad_norm": 16.37550163269043, "learning_rate": 2.4591587793049052e-05, "loss": 2.504, "step": 2432500 }, { "epoch": 5.093026760893593, "grad_norm": 13.474369049072266, "learning_rate": 2.4586343536558763e-05, "loss": 2.4877, "step": 2433000 }, { "epoch": 5.094073416619218, "grad_norm": 15.474559783935547, "learning_rate": 2.458109928006847e-05, "loss": 2.5088, "step": 2433500 }, { "epoch": 5.0951200723448435, "grad_norm": 14.736139297485352, "learning_rate": 2.457585502357818e-05, "loss": 2.4835, "step": 2434000 }, { "epoch": 5.096166728070469, "grad_norm": 14.648402214050293, "learning_rate": 2.4570610767087886e-05, "loss": 2.5071, "step": 2434500 }, { "epoch": 5.0972133837960945, "grad_norm": 32.21669387817383, "learning_rate": 2.4565366510597597e-05, "loss": 2.4809, "step": 2435000 }, { "epoch": 5.09826003952172, "grad_norm": 16.812631607055664, "learning_rate": 2.4560122254107305e-05, "loss": 2.5076, "step": 2435500 }, { "epoch": 5.099306695247345, "grad_norm": 14.130071640014648, "learning_rate": 2.4554877997617013e-05, "loss": 2.505, "step": 2436000 }, { "epoch": 5.100353350972971, "grad_norm": 14.77419376373291, "learning_rate": 2.4549633741126717e-05, "loss": 2.4844, "step": 2436500 }, { "epoch": 5.101400006698596, "grad_norm": 13.64598274230957, "learning_rate": 2.4544389484636428e-05, "loss": 2.4814, "step": 2437000 }, { "epoch": 5.102446662424222, "grad_norm": 16.022064208984375, "learning_rate": 2.4539145228146136e-05, "loss": 2.4973, "step": 2437500 }, { "epoch": 5.103493318149847, "grad_norm": 17.226900100708008, "learning_rate": 2.4533900971655843e-05, "loss": 2.5139, "step": 2438000 }, { "epoch": 5.1045399738754735, "grad_norm": 15.751826286315918, "learning_rate": 2.452865671516555e-05, "loss": 2.4989, "step": 2438500 }, { "epoch": 5.105586629601099, "grad_norm": 14.608020782470703, "learning_rate": 2.452341245867526e-05, "loss": 2.48, "step": 2439000 }, { "epoch": 5.1066332853267244, "grad_norm": 16.178720474243164, "learning_rate": 2.451816820218497e-05, "loss": 2.4834, "step": 2439500 }, { "epoch": 5.10767994105235, "grad_norm": 16.798158645629883, "learning_rate": 2.4512923945694677e-05, "loss": 2.5169, "step": 2440000 }, { "epoch": 5.108726596777975, "grad_norm": 14.521955490112305, "learning_rate": 2.4507679689204385e-05, "loss": 2.4675, "step": 2440500 }, { "epoch": 5.109773252503601, "grad_norm": 16.28104019165039, "learning_rate": 2.4502435432714093e-05, "loss": 2.4871, "step": 2441000 }, { "epoch": 5.110819908229226, "grad_norm": 14.21634292602539, "learning_rate": 2.44971911762238e-05, "loss": 2.506, "step": 2441500 }, { "epoch": 5.111866563954852, "grad_norm": 14.540234565734863, "learning_rate": 2.4491946919733508e-05, "loss": 2.4928, "step": 2442000 }, { "epoch": 5.112913219680477, "grad_norm": 13.608769416809082, "learning_rate": 2.4486702663243216e-05, "loss": 2.487, "step": 2442500 }, { "epoch": 5.113959875406103, "grad_norm": 13.487133979797363, "learning_rate": 2.4481458406752924e-05, "loss": 2.5001, "step": 2443000 }, { "epoch": 5.115006531131728, "grad_norm": 14.311955451965332, "learning_rate": 2.447621415026263e-05, "loss": 2.5117, "step": 2443500 }, { "epoch": 5.1160531868573536, "grad_norm": 20.288549423217773, "learning_rate": 2.4470969893772342e-05, "loss": 2.4738, "step": 2444000 }, { "epoch": 5.117099842582979, "grad_norm": 16.4526424407959, "learning_rate": 2.446572563728205e-05, "loss": 2.488, "step": 2444500 }, { "epoch": 5.1181464983086045, "grad_norm": 15.822854042053223, "learning_rate": 2.4460481380791758e-05, "loss": 2.5018, "step": 2445000 }, { "epoch": 5.11919315403423, "grad_norm": 16.360877990722656, "learning_rate": 2.4455237124301465e-05, "loss": 2.479, "step": 2445500 }, { "epoch": 5.120239809759855, "grad_norm": 14.683219909667969, "learning_rate": 2.4449992867811177e-05, "loss": 2.5027, "step": 2446000 }, { "epoch": 5.121286465485481, "grad_norm": 15.109214782714844, "learning_rate": 2.4444748611320884e-05, "loss": 2.4983, "step": 2446500 }, { "epoch": 5.122333121211106, "grad_norm": 19.772552490234375, "learning_rate": 2.4439504354830592e-05, "loss": 2.4919, "step": 2447000 }, { "epoch": 5.123379776936732, "grad_norm": 16.106868743896484, "learning_rate": 2.4434260098340296e-05, "loss": 2.4972, "step": 2447500 }, { "epoch": 5.124426432662357, "grad_norm": 18.27731704711914, "learning_rate": 2.4429015841850007e-05, "loss": 2.506, "step": 2448000 }, { "epoch": 5.125473088387983, "grad_norm": 16.16326141357422, "learning_rate": 2.4423771585359715e-05, "loss": 2.5112, "step": 2448500 }, { "epoch": 5.126519744113608, "grad_norm": 13.622764587402344, "learning_rate": 2.4418527328869423e-05, "loss": 2.4795, "step": 2449000 }, { "epoch": 5.127566399839234, "grad_norm": 15.761818885803223, "learning_rate": 2.441328307237913e-05, "loss": 2.4874, "step": 2449500 }, { "epoch": 5.128613055564859, "grad_norm": 22.474380493164062, "learning_rate": 2.4408038815888838e-05, "loss": 2.4941, "step": 2450000 }, { "epoch": 5.1296597112904845, "grad_norm": 15.366142272949219, "learning_rate": 2.440279455939855e-05, "loss": 2.4828, "step": 2450500 }, { "epoch": 5.13070636701611, "grad_norm": 20.884077072143555, "learning_rate": 2.4397550302908257e-05, "loss": 2.5006, "step": 2451000 }, { "epoch": 5.131753022741735, "grad_norm": 14.36867618560791, "learning_rate": 2.4392306046417965e-05, "loss": 2.4806, "step": 2451500 }, { "epoch": 5.132799678467361, "grad_norm": 15.867644309997559, "learning_rate": 2.4387061789927672e-05, "loss": 2.4953, "step": 2452000 }, { "epoch": 5.133846334192986, "grad_norm": 15.137541770935059, "learning_rate": 2.4381817533437383e-05, "loss": 2.5035, "step": 2452500 }, { "epoch": 5.134892989918612, "grad_norm": 16.74985122680664, "learning_rate": 2.437657327694709e-05, "loss": 2.5089, "step": 2453000 }, { "epoch": 5.135939645644237, "grad_norm": 14.81727409362793, "learning_rate": 2.4371329020456795e-05, "loss": 2.5027, "step": 2453500 }, { "epoch": 5.136986301369863, "grad_norm": 14.520087242126465, "learning_rate": 2.4366084763966503e-05, "loss": 2.49, "step": 2454000 }, { "epoch": 5.138032957095488, "grad_norm": 14.785577774047852, "learning_rate": 2.4360840507476214e-05, "loss": 2.4881, "step": 2454500 }, { "epoch": 5.139079612821114, "grad_norm": 18.962923049926758, "learning_rate": 2.4355596250985922e-05, "loss": 2.4959, "step": 2455000 }, { "epoch": 5.140126268546739, "grad_norm": 14.08920955657959, "learning_rate": 2.435035199449563e-05, "loss": 2.4847, "step": 2455500 }, { "epoch": 5.1411729242723645, "grad_norm": 13.883313179016113, "learning_rate": 2.4345107738005337e-05, "loss": 2.4913, "step": 2456000 }, { "epoch": 5.142219579997991, "grad_norm": 15.100257873535156, "learning_rate": 2.4339863481515045e-05, "loss": 2.5094, "step": 2456500 }, { "epoch": 5.143266235723616, "grad_norm": 13.283472061157227, "learning_rate": 2.4334619225024756e-05, "loss": 2.4797, "step": 2457000 }, { "epoch": 5.144312891449242, "grad_norm": 13.802899360656738, "learning_rate": 2.4329374968534464e-05, "loss": 2.4899, "step": 2457500 }, { "epoch": 5.145359547174867, "grad_norm": 14.525918960571289, "learning_rate": 2.432413071204417e-05, "loss": 2.5046, "step": 2458000 }, { "epoch": 5.146406202900493, "grad_norm": 18.7606258392334, "learning_rate": 2.431888645555388e-05, "loss": 2.4805, "step": 2458500 }, { "epoch": 5.147452858626118, "grad_norm": 17.249479293823242, "learning_rate": 2.4313642199063587e-05, "loss": 2.4877, "step": 2459000 }, { "epoch": 5.148499514351744, "grad_norm": 16.8850040435791, "learning_rate": 2.4308397942573294e-05, "loss": 2.4729, "step": 2459500 }, { "epoch": 5.149546170077369, "grad_norm": 15.417181968688965, "learning_rate": 2.4303153686083002e-05, "loss": 2.4757, "step": 2460000 }, { "epoch": 5.1505928258029945, "grad_norm": 16.863426208496094, "learning_rate": 2.429790942959271e-05, "loss": 2.4768, "step": 2460500 }, { "epoch": 5.15163948152862, "grad_norm": 16.16876792907715, "learning_rate": 2.4292665173102417e-05, "loss": 2.4842, "step": 2461000 }, { "epoch": 5.152686137254245, "grad_norm": 14.942794799804688, "learning_rate": 2.428742091661213e-05, "loss": 2.4816, "step": 2461500 }, { "epoch": 5.153732792979871, "grad_norm": 16.60542869567871, "learning_rate": 2.4282176660121836e-05, "loss": 2.5039, "step": 2462000 }, { "epoch": 5.154779448705496, "grad_norm": 13.678284645080566, "learning_rate": 2.4276932403631544e-05, "loss": 2.4779, "step": 2462500 }, { "epoch": 5.155826104431122, "grad_norm": 14.330940246582031, "learning_rate": 2.427168814714125e-05, "loss": 2.4727, "step": 2463000 }, { "epoch": 5.156872760156747, "grad_norm": 16.774805068969727, "learning_rate": 2.4266443890650963e-05, "loss": 2.4762, "step": 2463500 }, { "epoch": 5.157919415882373, "grad_norm": 15.577851295471191, "learning_rate": 2.426119963416067e-05, "loss": 2.4814, "step": 2464000 }, { "epoch": 5.158966071607998, "grad_norm": 18.350933074951172, "learning_rate": 2.4255955377670375e-05, "loss": 2.4744, "step": 2464500 }, { "epoch": 5.160012727333624, "grad_norm": 15.862805366516113, "learning_rate": 2.4250711121180082e-05, "loss": 2.5037, "step": 2465000 }, { "epoch": 5.161059383059249, "grad_norm": 18.95450210571289, "learning_rate": 2.4245466864689793e-05, "loss": 2.477, "step": 2465500 }, { "epoch": 5.1621060387848745, "grad_norm": 16.821691513061523, "learning_rate": 2.42402226081995e-05, "loss": 2.4933, "step": 2466000 }, { "epoch": 5.1631526945105, "grad_norm": 14.243582725524902, "learning_rate": 2.423497835170921e-05, "loss": 2.4739, "step": 2466500 }, { "epoch": 5.164199350236125, "grad_norm": 15.022834777832031, "learning_rate": 2.4229734095218916e-05, "loss": 2.4835, "step": 2467000 }, { "epoch": 5.165246005961751, "grad_norm": 14.500730514526367, "learning_rate": 2.4224489838728624e-05, "loss": 2.4986, "step": 2467500 }, { "epoch": 5.166292661687376, "grad_norm": 13.78397274017334, "learning_rate": 2.4219245582238335e-05, "loss": 2.4918, "step": 2468000 }, { "epoch": 5.167339317413002, "grad_norm": 17.67623519897461, "learning_rate": 2.4214001325748043e-05, "loss": 2.4985, "step": 2468500 }, { "epoch": 5.168385973138627, "grad_norm": 16.453014373779297, "learning_rate": 2.420875706925775e-05, "loss": 2.4882, "step": 2469000 }, { "epoch": 5.169432628864253, "grad_norm": 15.640151023864746, "learning_rate": 2.420351281276746e-05, "loss": 2.4989, "step": 2469500 }, { "epoch": 5.170479284589878, "grad_norm": 14.780866622924805, "learning_rate": 2.4198268556277166e-05, "loss": 2.4772, "step": 2470000 }, { "epoch": 5.171525940315504, "grad_norm": 16.041622161865234, "learning_rate": 2.4193024299786874e-05, "loss": 2.4945, "step": 2470500 }, { "epoch": 5.172572596041129, "grad_norm": 13.463412284851074, "learning_rate": 2.418778004329658e-05, "loss": 2.4877, "step": 2471000 }, { "epoch": 5.1736192517667545, "grad_norm": 17.434764862060547, "learning_rate": 2.418253578680629e-05, "loss": 2.5116, "step": 2471500 }, { "epoch": 5.17466590749238, "grad_norm": 14.007652282714844, "learning_rate": 2.4177291530316e-05, "loss": 2.4976, "step": 2472000 }, { "epoch": 5.175712563218005, "grad_norm": 14.858572959899902, "learning_rate": 2.4172047273825708e-05, "loss": 2.4934, "step": 2472500 }, { "epoch": 5.176759218943631, "grad_norm": 15.026376724243164, "learning_rate": 2.4166803017335416e-05, "loss": 2.4898, "step": 2473000 }, { "epoch": 5.177805874669257, "grad_norm": 15.413891792297363, "learning_rate": 2.4161558760845123e-05, "loss": 2.4914, "step": 2473500 }, { "epoch": 5.178852530394883, "grad_norm": 15.837128639221191, "learning_rate": 2.415631450435483e-05, "loss": 2.4786, "step": 2474000 }, { "epoch": 5.179899186120508, "grad_norm": 22.104379653930664, "learning_rate": 2.4151070247864542e-05, "loss": 2.4965, "step": 2474500 }, { "epoch": 5.180945841846134, "grad_norm": 15.589006423950195, "learning_rate": 2.414582599137425e-05, "loss": 2.4585, "step": 2475000 }, { "epoch": 5.181992497571759, "grad_norm": 14.378357887268066, "learning_rate": 2.4140581734883957e-05, "loss": 2.501, "step": 2475500 }, { "epoch": 5.1830391532973845, "grad_norm": 13.881062507629395, "learning_rate": 2.4135337478393662e-05, "loss": 2.4888, "step": 2476000 }, { "epoch": 5.18408580902301, "grad_norm": 15.329877853393555, "learning_rate": 2.4130093221903373e-05, "loss": 2.4619, "step": 2476500 }, { "epoch": 5.185132464748635, "grad_norm": 13.395918846130371, "learning_rate": 2.412484896541308e-05, "loss": 2.4825, "step": 2477000 }, { "epoch": 5.186179120474261, "grad_norm": 15.702645301818848, "learning_rate": 2.4119604708922788e-05, "loss": 2.5278, "step": 2477500 }, { "epoch": 5.187225776199886, "grad_norm": 15.012467384338379, "learning_rate": 2.4114360452432496e-05, "loss": 2.4925, "step": 2478000 }, { "epoch": 5.188272431925512, "grad_norm": 17.208467483520508, "learning_rate": 2.4109116195942204e-05, "loss": 2.4957, "step": 2478500 }, { "epoch": 5.189319087651137, "grad_norm": 15.785093307495117, "learning_rate": 2.4103871939451915e-05, "loss": 2.4767, "step": 2479000 }, { "epoch": 5.190365743376763, "grad_norm": 15.554059982299805, "learning_rate": 2.4098627682961622e-05, "loss": 2.4765, "step": 2479500 }, { "epoch": 5.191412399102388, "grad_norm": 16.719207763671875, "learning_rate": 2.409338342647133e-05, "loss": 2.4869, "step": 2480000 }, { "epoch": 5.192459054828014, "grad_norm": 15.791983604431152, "learning_rate": 2.4088139169981038e-05, "loss": 2.4865, "step": 2480500 }, { "epoch": 5.193505710553639, "grad_norm": 15.904462814331055, "learning_rate": 2.408289491349075e-05, "loss": 2.5, "step": 2481000 }, { "epoch": 5.1945523662792645, "grad_norm": 20.68381690979004, "learning_rate": 2.4077650657000456e-05, "loss": 2.458, "step": 2481500 }, { "epoch": 5.19559902200489, "grad_norm": 15.218734741210938, "learning_rate": 2.407240640051016e-05, "loss": 2.476, "step": 2482000 }, { "epoch": 5.1966456777305154, "grad_norm": 15.856230735778809, "learning_rate": 2.406716214401987e-05, "loss": 2.4861, "step": 2482500 }, { "epoch": 5.197692333456141, "grad_norm": 16.426855087280273, "learning_rate": 2.406191788752958e-05, "loss": 2.4919, "step": 2483000 }, { "epoch": 5.198738989181766, "grad_norm": 15.60031509399414, "learning_rate": 2.4056673631039287e-05, "loss": 2.4756, "step": 2483500 }, { "epoch": 5.199785644907392, "grad_norm": 15.202068328857422, "learning_rate": 2.4051429374548995e-05, "loss": 2.4684, "step": 2484000 }, { "epoch": 5.200832300633017, "grad_norm": 27.158231735229492, "learning_rate": 2.4046185118058703e-05, "loss": 2.4796, "step": 2484500 }, { "epoch": 5.201878956358643, "grad_norm": 13.651429176330566, "learning_rate": 2.404094086156841e-05, "loss": 2.4774, "step": 2485000 }, { "epoch": 5.202925612084268, "grad_norm": 14.937368392944336, "learning_rate": 2.403569660507812e-05, "loss": 2.4961, "step": 2485500 }, { "epoch": 5.203972267809894, "grad_norm": 13.994231224060059, "learning_rate": 2.403045234858783e-05, "loss": 2.4871, "step": 2486000 }, { "epoch": 5.205018923535519, "grad_norm": 17.058256149291992, "learning_rate": 2.4025208092097537e-05, "loss": 2.4905, "step": 2486500 }, { "epoch": 5.2060655792611445, "grad_norm": 15.46399211883545, "learning_rate": 2.4019963835607244e-05, "loss": 2.4979, "step": 2487000 }, { "epoch": 5.20711223498677, "grad_norm": 16.044858932495117, "learning_rate": 2.4014719579116952e-05, "loss": 2.507, "step": 2487500 }, { "epoch": 5.2081588907123955, "grad_norm": 17.030052185058594, "learning_rate": 2.400947532262666e-05, "loss": 2.4897, "step": 2488000 }, { "epoch": 5.209205546438021, "grad_norm": 15.33266544342041, "learning_rate": 2.4004231066136368e-05, "loss": 2.4925, "step": 2488500 }, { "epoch": 5.210252202163646, "grad_norm": 15.89932632446289, "learning_rate": 2.3998986809646075e-05, "loss": 2.4806, "step": 2489000 }, { "epoch": 5.211298857889272, "grad_norm": 14.150468826293945, "learning_rate": 2.3993742553155786e-05, "loss": 2.4797, "step": 2489500 }, { "epoch": 5.212345513614897, "grad_norm": 16.695873260498047, "learning_rate": 2.3988498296665494e-05, "loss": 2.4985, "step": 2490000 }, { "epoch": 5.213392169340523, "grad_norm": 14.053078651428223, "learning_rate": 2.3983254040175202e-05, "loss": 2.4925, "step": 2490500 }, { "epoch": 5.214438825066148, "grad_norm": 17.218412399291992, "learning_rate": 2.397800978368491e-05, "loss": 2.4768, "step": 2491000 }, { "epoch": 5.2154854807917745, "grad_norm": 17.59647560119629, "learning_rate": 2.3972765527194617e-05, "loss": 2.4753, "step": 2491500 }, { "epoch": 5.2165321365174, "grad_norm": 20.728342056274414, "learning_rate": 2.3967521270704328e-05, "loss": 2.4852, "step": 2492000 }, { "epoch": 5.2175787922430255, "grad_norm": 15.898322105407715, "learning_rate": 2.3962277014214036e-05, "loss": 2.4679, "step": 2492500 }, { "epoch": 5.218625447968651, "grad_norm": 17.477876663208008, "learning_rate": 2.395703275772374e-05, "loss": 2.4787, "step": 2493000 }, { "epoch": 5.219672103694276, "grad_norm": 13.97575855255127, "learning_rate": 2.3951788501233448e-05, "loss": 2.4979, "step": 2493500 }, { "epoch": 5.220718759419902, "grad_norm": 15.575037002563477, "learning_rate": 2.394654424474316e-05, "loss": 2.4931, "step": 2494000 }, { "epoch": 5.221765415145527, "grad_norm": 15.045036315917969, "learning_rate": 2.3941299988252867e-05, "loss": 2.4812, "step": 2494500 }, { "epoch": 5.222812070871153, "grad_norm": 13.52884292602539, "learning_rate": 2.3936055731762574e-05, "loss": 2.4888, "step": 2495000 }, { "epoch": 5.223858726596778, "grad_norm": 17.21628761291504, "learning_rate": 2.3930811475272282e-05, "loss": 2.5054, "step": 2495500 }, { "epoch": 5.224905382322404, "grad_norm": 14.356765747070312, "learning_rate": 2.392556721878199e-05, "loss": 2.4821, "step": 2496000 }, { "epoch": 5.225952038048029, "grad_norm": 15.265840530395508, "learning_rate": 2.39203229622917e-05, "loss": 2.4852, "step": 2496500 }, { "epoch": 5.226998693773655, "grad_norm": 15.982207298278809, "learning_rate": 2.391507870580141e-05, "loss": 2.4969, "step": 2497000 }, { "epoch": 5.22804534949928, "grad_norm": 14.976497650146484, "learning_rate": 2.3909834449311116e-05, "loss": 2.4881, "step": 2497500 }, { "epoch": 5.2290920052249055, "grad_norm": 13.918831825256348, "learning_rate": 2.3904590192820824e-05, "loss": 2.4731, "step": 2498000 }, { "epoch": 5.230138660950531, "grad_norm": 17.581161499023438, "learning_rate": 2.3899345936330535e-05, "loss": 2.4837, "step": 2498500 }, { "epoch": 5.231185316676156, "grad_norm": 14.684821128845215, "learning_rate": 2.389410167984024e-05, "loss": 2.4902, "step": 2499000 }, { "epoch": 5.232231972401782, "grad_norm": 16.78931427001953, "learning_rate": 2.3888857423349947e-05, "loss": 2.4754, "step": 2499500 }, { "epoch": 5.233278628127407, "grad_norm": 14.199698448181152, "learning_rate": 2.3883613166859655e-05, "loss": 2.4776, "step": 2500000 }, { "epoch": 5.234325283853033, "grad_norm": 14.010775566101074, "learning_rate": 2.3878368910369366e-05, "loss": 2.5026, "step": 2500500 }, { "epoch": 5.235371939578658, "grad_norm": 17.938671112060547, "learning_rate": 2.3873124653879073e-05, "loss": 2.5066, "step": 2501000 }, { "epoch": 5.236418595304284, "grad_norm": 18.7869815826416, "learning_rate": 2.386788039738878e-05, "loss": 2.4818, "step": 2501500 }, { "epoch": 5.237465251029909, "grad_norm": 20.735183715820312, "learning_rate": 2.386263614089849e-05, "loss": 2.4821, "step": 2502000 }, { "epoch": 5.238511906755535, "grad_norm": 18.46427345275879, "learning_rate": 2.3857391884408196e-05, "loss": 2.4807, "step": 2502500 }, { "epoch": 5.23955856248116, "grad_norm": 18.06796646118164, "learning_rate": 2.3852147627917908e-05, "loss": 2.494, "step": 2503000 }, { "epoch": 5.2406052182067855, "grad_norm": 21.042512893676758, "learning_rate": 2.3846903371427615e-05, "loss": 2.5119, "step": 2503500 }, { "epoch": 5.241651873932411, "grad_norm": 13.94704532623291, "learning_rate": 2.3841659114937323e-05, "loss": 2.4827, "step": 2504000 }, { "epoch": 5.242698529658036, "grad_norm": 15.22958755493164, "learning_rate": 2.3836414858447027e-05, "loss": 2.502, "step": 2504500 }, { "epoch": 5.243745185383662, "grad_norm": 15.584726333618164, "learning_rate": 2.3831170601956738e-05, "loss": 2.4707, "step": 2505000 }, { "epoch": 5.244791841109287, "grad_norm": 20.291349411010742, "learning_rate": 2.3825926345466446e-05, "loss": 2.5008, "step": 2505500 }, { "epoch": 5.245838496834913, "grad_norm": 14.073310852050781, "learning_rate": 2.3820682088976154e-05, "loss": 2.51, "step": 2506000 }, { "epoch": 5.246885152560538, "grad_norm": 20.379785537719727, "learning_rate": 2.381543783248586e-05, "loss": 2.4923, "step": 2506500 }, { "epoch": 5.247931808286164, "grad_norm": 16.32353401184082, "learning_rate": 2.3810193575995572e-05, "loss": 2.4952, "step": 2507000 }, { "epoch": 5.248978464011789, "grad_norm": 14.7827730178833, "learning_rate": 2.380494931950528e-05, "loss": 2.4642, "step": 2507500 }, { "epoch": 5.2500251197374155, "grad_norm": 16.067588806152344, "learning_rate": 2.3799705063014988e-05, "loss": 2.4775, "step": 2508000 }, { "epoch": 5.251071775463041, "grad_norm": 17.874977111816406, "learning_rate": 2.3794460806524696e-05, "loss": 2.478, "step": 2508500 }, { "epoch": 5.252118431188666, "grad_norm": 15.323598861694336, "learning_rate": 2.3789216550034403e-05, "loss": 2.4844, "step": 2509000 }, { "epoch": 5.253165086914292, "grad_norm": 13.52985954284668, "learning_rate": 2.3783972293544114e-05, "loss": 2.4732, "step": 2509500 }, { "epoch": 5.254211742639917, "grad_norm": 15.739004135131836, "learning_rate": 2.377872803705382e-05, "loss": 2.4819, "step": 2510000 }, { "epoch": 5.255258398365543, "grad_norm": 19.570375442504883, "learning_rate": 2.3773483780563526e-05, "loss": 2.49, "step": 2510500 }, { "epoch": 5.256305054091168, "grad_norm": 20.742176055908203, "learning_rate": 2.3768239524073234e-05, "loss": 2.4772, "step": 2511000 }, { "epoch": 5.257351709816794, "grad_norm": 15.861637115478516, "learning_rate": 2.3762995267582945e-05, "loss": 2.4764, "step": 2511500 }, { "epoch": 5.258398365542419, "grad_norm": 15.745834350585938, "learning_rate": 2.3757751011092653e-05, "loss": 2.503, "step": 2512000 }, { "epoch": 5.259445021268045, "grad_norm": 15.776607513427734, "learning_rate": 2.375250675460236e-05, "loss": 2.4938, "step": 2512500 }, { "epoch": 5.26049167699367, "grad_norm": 13.051741600036621, "learning_rate": 2.3747262498112068e-05, "loss": 2.4934, "step": 2513000 }, { "epoch": 5.2615383327192955, "grad_norm": 17.036283493041992, "learning_rate": 2.3742018241621776e-05, "loss": 2.4824, "step": 2513500 }, { "epoch": 5.262584988444921, "grad_norm": 19.08275032043457, "learning_rate": 2.3736773985131487e-05, "loss": 2.4875, "step": 2514000 }, { "epoch": 5.263631644170546, "grad_norm": 17.708154678344727, "learning_rate": 2.3731529728641195e-05, "loss": 2.493, "step": 2514500 }, { "epoch": 5.264678299896172, "grad_norm": 13.874448776245117, "learning_rate": 2.3726285472150902e-05, "loss": 2.4893, "step": 2515000 }, { "epoch": 5.265724955621797, "grad_norm": 15.453471183776855, "learning_rate": 2.372104121566061e-05, "loss": 2.4871, "step": 2515500 }, { "epoch": 5.266771611347423, "grad_norm": 15.431546211242676, "learning_rate": 2.3715796959170318e-05, "loss": 2.4567, "step": 2516000 }, { "epoch": 5.267818267073048, "grad_norm": 16.280031204223633, "learning_rate": 2.3710552702680025e-05, "loss": 2.4857, "step": 2516500 }, { "epoch": 5.268864922798674, "grad_norm": 15.289142608642578, "learning_rate": 2.3705308446189733e-05, "loss": 2.4865, "step": 2517000 }, { "epoch": 5.269911578524299, "grad_norm": 14.491216659545898, "learning_rate": 2.370006418969944e-05, "loss": 2.4734, "step": 2517500 }, { "epoch": 5.270958234249925, "grad_norm": 14.43309497833252, "learning_rate": 2.3694819933209152e-05, "loss": 2.4696, "step": 2518000 }, { "epoch": 5.27200488997555, "grad_norm": 15.305180549621582, "learning_rate": 2.368957567671886e-05, "loss": 2.4689, "step": 2518500 }, { "epoch": 5.2730515457011755, "grad_norm": 15.673260688781738, "learning_rate": 2.3684331420228567e-05, "loss": 2.4812, "step": 2519000 }, { "epoch": 5.274098201426801, "grad_norm": 13.246280670166016, "learning_rate": 2.3679087163738275e-05, "loss": 2.4939, "step": 2519500 }, { "epoch": 5.275144857152426, "grad_norm": 17.08423614501953, "learning_rate": 2.3673842907247983e-05, "loss": 2.4801, "step": 2520000 }, { "epoch": 5.276191512878052, "grad_norm": 15.434536933898926, "learning_rate": 2.3668598650757694e-05, "loss": 2.4735, "step": 2520500 }, { "epoch": 5.277238168603677, "grad_norm": 16.64961814880371, "learning_rate": 2.36633543942674e-05, "loss": 2.491, "step": 2521000 }, { "epoch": 5.278284824329303, "grad_norm": 15.13710880279541, "learning_rate": 2.3658110137777106e-05, "loss": 2.4547, "step": 2521500 }, { "epoch": 5.279331480054928, "grad_norm": 15.775704383850098, "learning_rate": 2.3652865881286813e-05, "loss": 2.4777, "step": 2522000 }, { "epoch": 5.280378135780554, "grad_norm": 15.247715950012207, "learning_rate": 2.3647621624796524e-05, "loss": 2.4693, "step": 2522500 }, { "epoch": 5.281424791506179, "grad_norm": 16.746219635009766, "learning_rate": 2.3642377368306232e-05, "loss": 2.4874, "step": 2523000 }, { "epoch": 5.282471447231805, "grad_norm": 14.884650230407715, "learning_rate": 2.363713311181594e-05, "loss": 2.4825, "step": 2523500 }, { "epoch": 5.28351810295743, "grad_norm": 16.43604850769043, "learning_rate": 2.3631888855325648e-05, "loss": 2.5018, "step": 2524000 }, { "epoch": 5.2845647586830555, "grad_norm": 15.894998550415039, "learning_rate": 2.3626644598835355e-05, "loss": 2.4632, "step": 2524500 }, { "epoch": 5.285611414408681, "grad_norm": 16.014652252197266, "learning_rate": 2.3621400342345066e-05, "loss": 2.4807, "step": 2525000 }, { "epoch": 5.286658070134306, "grad_norm": 14.983975410461426, "learning_rate": 2.3616156085854774e-05, "loss": 2.5014, "step": 2525500 }, { "epoch": 5.287704725859932, "grad_norm": 16.420013427734375, "learning_rate": 2.361091182936448e-05, "loss": 2.473, "step": 2526000 }, { "epoch": 5.288751381585557, "grad_norm": 13.674647331237793, "learning_rate": 2.360566757287419e-05, "loss": 2.4702, "step": 2526500 }, { "epoch": 5.289798037311184, "grad_norm": 14.363327026367188, "learning_rate": 2.3600423316383897e-05, "loss": 2.4755, "step": 2527000 }, { "epoch": 5.290844693036809, "grad_norm": 14.169702529907227, "learning_rate": 2.3595179059893605e-05, "loss": 2.4681, "step": 2527500 }, { "epoch": 5.291891348762435, "grad_norm": 15.963385581970215, "learning_rate": 2.3589934803403312e-05, "loss": 2.4876, "step": 2528000 }, { "epoch": 5.29293800448806, "grad_norm": 16.807161331176758, "learning_rate": 2.358469054691302e-05, "loss": 2.4749, "step": 2528500 }, { "epoch": 5.2939846602136855, "grad_norm": 16.92003059387207, "learning_rate": 2.357944629042273e-05, "loss": 2.4995, "step": 2529000 }, { "epoch": 5.295031315939311, "grad_norm": 19.154296875, "learning_rate": 2.357420203393244e-05, "loss": 2.5001, "step": 2529500 }, { "epoch": 5.296077971664936, "grad_norm": 17.06239891052246, "learning_rate": 2.3568957777442147e-05, "loss": 2.4699, "step": 2530000 }, { "epoch": 5.297124627390562, "grad_norm": 15.479010581970215, "learning_rate": 2.3563713520951854e-05, "loss": 2.4641, "step": 2530500 }, { "epoch": 5.298171283116187, "grad_norm": 13.606451034545898, "learning_rate": 2.3558469264461562e-05, "loss": 2.4825, "step": 2531000 }, { "epoch": 5.299217938841813, "grad_norm": 17.06246566772461, "learning_rate": 2.3553225007971273e-05, "loss": 2.4656, "step": 2531500 }, { "epoch": 5.300264594567438, "grad_norm": 14.91499137878418, "learning_rate": 2.354798075148098e-05, "loss": 2.4738, "step": 2532000 }, { "epoch": 5.301311250293064, "grad_norm": 15.856934547424316, "learning_rate": 2.354273649499069e-05, "loss": 2.4882, "step": 2532500 }, { "epoch": 5.302357906018689, "grad_norm": 14.482473373413086, "learning_rate": 2.3537492238500393e-05, "loss": 2.4882, "step": 2533000 }, { "epoch": 5.303404561744315, "grad_norm": 15.045865058898926, "learning_rate": 2.3532247982010104e-05, "loss": 2.4664, "step": 2533500 }, { "epoch": 5.30445121746994, "grad_norm": 17.591814041137695, "learning_rate": 2.352700372551981e-05, "loss": 2.4872, "step": 2534000 }, { "epoch": 5.3054978731955655, "grad_norm": 19.984758377075195, "learning_rate": 2.352175946902952e-05, "loss": 2.4857, "step": 2534500 }, { "epoch": 5.306544528921191, "grad_norm": 14.65484619140625, "learning_rate": 2.3516515212539227e-05, "loss": 2.458, "step": 2535000 }, { "epoch": 5.3075911846468165, "grad_norm": 41.30550765991211, "learning_rate": 2.3511270956048938e-05, "loss": 2.5007, "step": 2535500 }, { "epoch": 5.308637840372442, "grad_norm": 15.421234130859375, "learning_rate": 2.3506026699558646e-05, "loss": 2.4808, "step": 2536000 }, { "epoch": 5.309684496098067, "grad_norm": 17.12590789794922, "learning_rate": 2.3500782443068353e-05, "loss": 2.4846, "step": 2536500 }, { "epoch": 5.310731151823693, "grad_norm": 14.247793197631836, "learning_rate": 2.349553818657806e-05, "loss": 2.4783, "step": 2537000 }, { "epoch": 5.311777807549318, "grad_norm": 15.97642707824707, "learning_rate": 2.349029393008777e-05, "loss": 2.4752, "step": 2537500 }, { "epoch": 5.312824463274944, "grad_norm": 19.313451766967773, "learning_rate": 2.348504967359748e-05, "loss": 2.475, "step": 2538000 }, { "epoch": 5.313871119000569, "grad_norm": 18.774730682373047, "learning_rate": 2.3479805417107184e-05, "loss": 2.4812, "step": 2538500 }, { "epoch": 5.314917774726195, "grad_norm": 14.746256828308105, "learning_rate": 2.3474561160616892e-05, "loss": 2.4917, "step": 2539000 }, { "epoch": 5.31596443045182, "grad_norm": 13.814191818237305, "learning_rate": 2.34693169041266e-05, "loss": 2.4815, "step": 2539500 }, { "epoch": 5.3170110861774456, "grad_norm": 16.784561157226562, "learning_rate": 2.346407264763631e-05, "loss": 2.4702, "step": 2540000 }, { "epoch": 5.318057741903071, "grad_norm": 18.266664505004883, "learning_rate": 2.3458828391146018e-05, "loss": 2.4924, "step": 2540500 }, { "epoch": 5.3191043976286965, "grad_norm": 16.18415069580078, "learning_rate": 2.3453584134655726e-05, "loss": 2.479, "step": 2541000 }, { "epoch": 5.320151053354322, "grad_norm": 15.620586395263672, "learning_rate": 2.3448339878165434e-05, "loss": 2.4871, "step": 2541500 }, { "epoch": 5.321197709079947, "grad_norm": 15.741308212280273, "learning_rate": 2.344309562167514e-05, "loss": 2.472, "step": 2542000 }, { "epoch": 5.322244364805573, "grad_norm": 14.725414276123047, "learning_rate": 2.3437851365184852e-05, "loss": 2.4854, "step": 2542500 }, { "epoch": 5.323291020531199, "grad_norm": 13.79758071899414, "learning_rate": 2.343260710869456e-05, "loss": 2.4659, "step": 2543000 }, { "epoch": 5.324337676256825, "grad_norm": 15.554767608642578, "learning_rate": 2.3427362852204268e-05, "loss": 2.4668, "step": 2543500 }, { "epoch": 5.32538433198245, "grad_norm": 15.63813304901123, "learning_rate": 2.3422118595713975e-05, "loss": 2.4843, "step": 2544000 }, { "epoch": 5.3264309877080755, "grad_norm": 14.974596977233887, "learning_rate": 2.3416874339223683e-05, "loss": 2.4805, "step": 2544500 }, { "epoch": 5.327477643433701, "grad_norm": 15.90799331665039, "learning_rate": 2.341163008273339e-05, "loss": 2.4612, "step": 2545000 }, { "epoch": 5.3285242991593265, "grad_norm": 16.14893341064453, "learning_rate": 2.34063858262431e-05, "loss": 2.4805, "step": 2545500 }, { "epoch": 5.329570954884952, "grad_norm": 16.7839412689209, "learning_rate": 2.3401141569752806e-05, "loss": 2.4934, "step": 2546000 }, { "epoch": 5.330617610610577, "grad_norm": 15.711363792419434, "learning_rate": 2.3395897313262517e-05, "loss": 2.4829, "step": 2546500 }, { "epoch": 5.331664266336203, "grad_norm": 16.178096771240234, "learning_rate": 2.3390653056772225e-05, "loss": 2.4896, "step": 2547000 }, { "epoch": 5.332710922061828, "grad_norm": 15.21695327758789, "learning_rate": 2.3385408800281933e-05, "loss": 2.4858, "step": 2547500 }, { "epoch": 5.333757577787454, "grad_norm": 19.335277557373047, "learning_rate": 2.338016454379164e-05, "loss": 2.4929, "step": 2548000 }, { "epoch": 5.334804233513079, "grad_norm": 17.16229248046875, "learning_rate": 2.3374920287301348e-05, "loss": 2.4943, "step": 2548500 }, { "epoch": 5.335850889238705, "grad_norm": 16.983915328979492, "learning_rate": 2.336967603081106e-05, "loss": 2.5069, "step": 2549000 }, { "epoch": 5.33689754496433, "grad_norm": 14.92564582824707, "learning_rate": 2.3364431774320767e-05, "loss": 2.4829, "step": 2549500 }, { "epoch": 5.337944200689956, "grad_norm": 14.917296409606934, "learning_rate": 2.335918751783047e-05, "loss": 2.4698, "step": 2550000 }, { "epoch": 5.338990856415581, "grad_norm": 16.448320388793945, "learning_rate": 2.335394326134018e-05, "loss": 2.4826, "step": 2550500 }, { "epoch": 5.3400375121412065, "grad_norm": 14.83991527557373, "learning_rate": 2.334869900484989e-05, "loss": 2.4685, "step": 2551000 }, { "epoch": 5.341084167866832, "grad_norm": 16.76102638244629, "learning_rate": 2.3343454748359598e-05, "loss": 2.4589, "step": 2551500 }, { "epoch": 5.342130823592457, "grad_norm": 14.905200958251953, "learning_rate": 2.3338210491869305e-05, "loss": 2.4901, "step": 2552000 }, { "epoch": 5.343177479318083, "grad_norm": 15.134827613830566, "learning_rate": 2.3332966235379013e-05, "loss": 2.4667, "step": 2552500 }, { "epoch": 5.344224135043708, "grad_norm": 17.111608505249023, "learning_rate": 2.3327721978888724e-05, "loss": 2.4856, "step": 2553000 }, { "epoch": 5.345270790769334, "grad_norm": 15.734007835388184, "learning_rate": 2.3322477722398432e-05, "loss": 2.4595, "step": 2553500 }, { "epoch": 5.346317446494959, "grad_norm": 22.219263076782227, "learning_rate": 2.331723346590814e-05, "loss": 2.4852, "step": 2554000 }, { "epoch": 5.347364102220585, "grad_norm": 16.771520614624023, "learning_rate": 2.3311989209417847e-05, "loss": 2.4771, "step": 2554500 }, { "epoch": 5.34841075794621, "grad_norm": 15.297616958618164, "learning_rate": 2.3306744952927555e-05, "loss": 2.4678, "step": 2555000 }, { "epoch": 5.349457413671836, "grad_norm": 15.789031982421875, "learning_rate": 2.3301500696437263e-05, "loss": 2.4817, "step": 2555500 }, { "epoch": 5.350504069397461, "grad_norm": 14.198441505432129, "learning_rate": 2.329625643994697e-05, "loss": 2.4679, "step": 2556000 }, { "epoch": 5.3515507251230865, "grad_norm": 21.077226638793945, "learning_rate": 2.3291012183456678e-05, "loss": 2.4886, "step": 2556500 }, { "epoch": 5.352597380848712, "grad_norm": 14.226508140563965, "learning_rate": 2.3285767926966386e-05, "loss": 2.4901, "step": 2557000 }, { "epoch": 5.353644036574337, "grad_norm": 16.548744201660156, "learning_rate": 2.3280523670476097e-05, "loss": 2.4796, "step": 2557500 }, { "epoch": 5.354690692299963, "grad_norm": 15.590274810791016, "learning_rate": 2.3275279413985804e-05, "loss": 2.4566, "step": 2558000 }, { "epoch": 5.355737348025588, "grad_norm": 15.222848892211914, "learning_rate": 2.3270035157495512e-05, "loss": 2.4785, "step": 2558500 }, { "epoch": 5.356784003751214, "grad_norm": 16.837966918945312, "learning_rate": 2.326479090100522e-05, "loss": 2.4703, "step": 2559000 }, { "epoch": 5.357830659476839, "grad_norm": 16.211402893066406, "learning_rate": 2.3259546644514927e-05, "loss": 2.4983, "step": 2559500 }, { "epoch": 5.358877315202465, "grad_norm": 15.87738037109375, "learning_rate": 2.325430238802464e-05, "loss": 2.4591, "step": 2560000 }, { "epoch": 5.35992397092809, "grad_norm": 15.19602108001709, "learning_rate": 2.3249058131534346e-05, "loss": 2.4905, "step": 2560500 }, { "epoch": 5.360970626653716, "grad_norm": 20.724010467529297, "learning_rate": 2.324381387504405e-05, "loss": 2.4691, "step": 2561000 }, { "epoch": 5.362017282379342, "grad_norm": 18.419815063476562, "learning_rate": 2.3238569618553758e-05, "loss": 2.4929, "step": 2561500 }, { "epoch": 5.363063938104967, "grad_norm": 17.20801544189453, "learning_rate": 2.323332536206347e-05, "loss": 2.4789, "step": 2562000 }, { "epoch": 5.364110593830593, "grad_norm": 23.55160903930664, "learning_rate": 2.3228081105573177e-05, "loss": 2.4718, "step": 2562500 }, { "epoch": 5.365157249556218, "grad_norm": 15.432973861694336, "learning_rate": 2.3222836849082885e-05, "loss": 2.4771, "step": 2563000 }, { "epoch": 5.366203905281844, "grad_norm": 15.522268295288086, "learning_rate": 2.3217592592592592e-05, "loss": 2.4735, "step": 2563500 }, { "epoch": 5.367250561007469, "grad_norm": 15.936020851135254, "learning_rate": 2.3212348336102303e-05, "loss": 2.4939, "step": 2564000 }, { "epoch": 5.368297216733095, "grad_norm": 19.09061050415039, "learning_rate": 2.320710407961201e-05, "loss": 2.4789, "step": 2564500 }, { "epoch": 5.36934387245872, "grad_norm": 15.185935020446777, "learning_rate": 2.320185982312172e-05, "loss": 2.478, "step": 2565000 }, { "epoch": 5.370390528184346, "grad_norm": 15.600902557373047, "learning_rate": 2.3196615566631427e-05, "loss": 2.489, "step": 2565500 }, { "epoch": 5.371437183909971, "grad_norm": 14.230485916137695, "learning_rate": 2.3191371310141134e-05, "loss": 2.4553, "step": 2566000 }, { "epoch": 5.3724838396355965, "grad_norm": 16.17791748046875, "learning_rate": 2.3186127053650845e-05, "loss": 2.481, "step": 2566500 }, { "epoch": 5.373530495361222, "grad_norm": 17.14748764038086, "learning_rate": 2.318088279716055e-05, "loss": 2.4802, "step": 2567000 }, { "epoch": 5.374577151086847, "grad_norm": 17.20151710510254, "learning_rate": 2.3175638540670257e-05, "loss": 2.4643, "step": 2567500 }, { "epoch": 5.375623806812473, "grad_norm": 19.851694107055664, "learning_rate": 2.3170394284179965e-05, "loss": 2.4783, "step": 2568000 }, { "epoch": 5.376670462538098, "grad_norm": 16.231792449951172, "learning_rate": 2.3165150027689676e-05, "loss": 2.4609, "step": 2568500 }, { "epoch": 5.377717118263724, "grad_norm": 15.293176651000977, "learning_rate": 2.3159905771199384e-05, "loss": 2.4829, "step": 2569000 }, { "epoch": 5.378763773989349, "grad_norm": 18.135271072387695, "learning_rate": 2.315466151470909e-05, "loss": 2.4803, "step": 2569500 }, { "epoch": 5.379810429714975, "grad_norm": 16.131685256958008, "learning_rate": 2.31494172582188e-05, "loss": 2.4798, "step": 2570000 }, { "epoch": 5.3808570854406, "grad_norm": 14.382246971130371, "learning_rate": 2.314417300172851e-05, "loss": 2.4792, "step": 2570500 }, { "epoch": 5.381903741166226, "grad_norm": 14.672411918640137, "learning_rate": 2.3138928745238218e-05, "loss": 2.4902, "step": 2571000 }, { "epoch": 5.382950396891851, "grad_norm": 26.467849731445312, "learning_rate": 2.3133684488747926e-05, "loss": 2.4721, "step": 2571500 }, { "epoch": 5.3839970526174765, "grad_norm": 16.82796859741211, "learning_rate": 2.3128440232257633e-05, "loss": 2.4864, "step": 2572000 }, { "epoch": 5.385043708343102, "grad_norm": 14.534822463989258, "learning_rate": 2.312319597576734e-05, "loss": 2.4793, "step": 2572500 }, { "epoch": 5.386090364068727, "grad_norm": 19.83770751953125, "learning_rate": 2.311795171927705e-05, "loss": 2.4773, "step": 2573000 }, { "epoch": 5.387137019794353, "grad_norm": 13.74590015411377, "learning_rate": 2.3112707462786756e-05, "loss": 2.4604, "step": 2573500 }, { "epoch": 5.388183675519978, "grad_norm": 15.935447692871094, "learning_rate": 2.3107463206296464e-05, "loss": 2.4777, "step": 2574000 }, { "epoch": 5.389230331245604, "grad_norm": 12.644477844238281, "learning_rate": 2.3102218949806172e-05, "loss": 2.469, "step": 2574500 }, { "epoch": 5.390276986971229, "grad_norm": 16.859622955322266, "learning_rate": 2.3096974693315883e-05, "loss": 2.4881, "step": 2575000 }, { "epoch": 5.391323642696855, "grad_norm": 15.998077392578125, "learning_rate": 2.309173043682559e-05, "loss": 2.5026, "step": 2575500 }, { "epoch": 5.39237029842248, "grad_norm": 16.946521759033203, "learning_rate": 2.3086486180335298e-05, "loss": 2.477, "step": 2576000 }, { "epoch": 5.393416954148106, "grad_norm": 14.611058235168457, "learning_rate": 2.3081241923845006e-05, "loss": 2.4702, "step": 2576500 }, { "epoch": 5.394463609873731, "grad_norm": 15.509187698364258, "learning_rate": 2.3075997667354714e-05, "loss": 2.4833, "step": 2577000 }, { "epoch": 5.3955102655993565, "grad_norm": 14.283171653747559, "learning_rate": 2.3070753410864425e-05, "loss": 2.504, "step": 2577500 }, { "epoch": 5.396556921324983, "grad_norm": 18.282325744628906, "learning_rate": 2.306550915437413e-05, "loss": 2.4935, "step": 2578000 }, { "epoch": 5.397603577050608, "grad_norm": 13.577821731567383, "learning_rate": 2.3060264897883837e-05, "loss": 2.4757, "step": 2578500 }, { "epoch": 5.398650232776234, "grad_norm": 17.346256256103516, "learning_rate": 2.3055020641393544e-05, "loss": 2.4639, "step": 2579000 }, { "epoch": 5.399696888501859, "grad_norm": 15.813921928405762, "learning_rate": 2.3049776384903255e-05, "loss": 2.4753, "step": 2579500 }, { "epoch": 5.400743544227485, "grad_norm": 16.894420623779297, "learning_rate": 2.3044532128412963e-05, "loss": 2.4636, "step": 2580000 }, { "epoch": 5.40179019995311, "grad_norm": 17.275354385375977, "learning_rate": 2.303928787192267e-05, "loss": 2.471, "step": 2580500 }, { "epoch": 5.402836855678736, "grad_norm": 14.364837646484375, "learning_rate": 2.303404361543238e-05, "loss": 2.4555, "step": 2581000 }, { "epoch": 5.403883511404361, "grad_norm": 16.22454261779785, "learning_rate": 2.302879935894209e-05, "loss": 2.4862, "step": 2581500 }, { "epoch": 5.4049301671299865, "grad_norm": 17.823461532592773, "learning_rate": 2.3023555102451797e-05, "loss": 2.4762, "step": 2582000 }, { "epoch": 5.405976822855612, "grad_norm": 14.985087394714355, "learning_rate": 2.3018310845961505e-05, "loss": 2.4647, "step": 2582500 }, { "epoch": 5.407023478581237, "grad_norm": 18.033815383911133, "learning_rate": 2.3013066589471213e-05, "loss": 2.491, "step": 2583000 }, { "epoch": 5.408070134306863, "grad_norm": 14.702672958374023, "learning_rate": 2.300782233298092e-05, "loss": 2.4533, "step": 2583500 }, { "epoch": 5.409116790032488, "grad_norm": 23.589792251586914, "learning_rate": 2.3002578076490628e-05, "loss": 2.4848, "step": 2584000 }, { "epoch": 5.410163445758114, "grad_norm": 16.57327651977539, "learning_rate": 2.2997333820000336e-05, "loss": 2.4851, "step": 2584500 }, { "epoch": 5.411210101483739, "grad_norm": 14.59026050567627, "learning_rate": 2.2992089563510043e-05, "loss": 2.4603, "step": 2585000 }, { "epoch": 5.412256757209365, "grad_norm": 13.534834861755371, "learning_rate": 2.298684530701975e-05, "loss": 2.4618, "step": 2585500 }, { "epoch": 5.41330341293499, "grad_norm": 18.451425552368164, "learning_rate": 2.2981601050529462e-05, "loss": 2.4833, "step": 2586000 }, { "epoch": 5.414350068660616, "grad_norm": 16.92467498779297, "learning_rate": 2.297635679403917e-05, "loss": 2.4834, "step": 2586500 }, { "epoch": 5.415396724386241, "grad_norm": 14.653650283813477, "learning_rate": 2.2971112537548878e-05, "loss": 2.4728, "step": 2587000 }, { "epoch": 5.4164433801118665, "grad_norm": 17.173643112182617, "learning_rate": 2.2965868281058585e-05, "loss": 2.4772, "step": 2587500 }, { "epoch": 5.417490035837492, "grad_norm": 15.610124588012695, "learning_rate": 2.2960624024568293e-05, "loss": 2.4774, "step": 2588000 }, { "epoch": 5.4185366915631175, "grad_norm": 17.872753143310547, "learning_rate": 2.2955379768078004e-05, "loss": 2.4812, "step": 2588500 }, { "epoch": 5.419583347288743, "grad_norm": 18.798616409301758, "learning_rate": 2.2950135511587712e-05, "loss": 2.4902, "step": 2589000 }, { "epoch": 5.420630003014368, "grad_norm": 18.25033187866211, "learning_rate": 2.2944891255097416e-05, "loss": 2.4837, "step": 2589500 }, { "epoch": 5.421676658739994, "grad_norm": 14.681889533996582, "learning_rate": 2.2939646998607127e-05, "loss": 2.4813, "step": 2590000 }, { "epoch": 5.422723314465619, "grad_norm": 15.42328929901123, "learning_rate": 2.2934402742116835e-05, "loss": 2.4764, "step": 2590500 }, { "epoch": 5.423769970191245, "grad_norm": 13.440685272216797, "learning_rate": 2.2929158485626542e-05, "loss": 2.4877, "step": 2591000 }, { "epoch": 5.42481662591687, "grad_norm": 16.566814422607422, "learning_rate": 2.292391422913625e-05, "loss": 2.4574, "step": 2591500 }, { "epoch": 5.425863281642496, "grad_norm": 13.853599548339844, "learning_rate": 2.2918669972645958e-05, "loss": 2.4552, "step": 2592000 }, { "epoch": 5.426909937368121, "grad_norm": 15.090597152709961, "learning_rate": 2.291342571615567e-05, "loss": 2.4779, "step": 2592500 }, { "epoch": 5.427956593093747, "grad_norm": 15.890128135681152, "learning_rate": 2.2908181459665377e-05, "loss": 2.4849, "step": 2593000 }, { "epoch": 5.429003248819372, "grad_norm": 16.626211166381836, "learning_rate": 2.2902937203175084e-05, "loss": 2.4772, "step": 2593500 }, { "epoch": 5.4300499045449975, "grad_norm": 16.111597061157227, "learning_rate": 2.2897692946684792e-05, "loss": 2.4645, "step": 2594000 }, { "epoch": 5.431096560270623, "grad_norm": 16.32441520690918, "learning_rate": 2.28924486901945e-05, "loss": 2.4586, "step": 2594500 }, { "epoch": 5.432143215996248, "grad_norm": 16.45210075378418, "learning_rate": 2.288720443370421e-05, "loss": 2.4594, "step": 2595000 }, { "epoch": 5.433189871721874, "grad_norm": 19.31691551208496, "learning_rate": 2.2881960177213915e-05, "loss": 2.473, "step": 2595500 }, { "epoch": 5.434236527447499, "grad_norm": 13.423605918884277, "learning_rate": 2.2876715920723623e-05, "loss": 2.4933, "step": 2596000 }, { "epoch": 5.435283183173126, "grad_norm": 18.86305046081543, "learning_rate": 2.287147166423333e-05, "loss": 2.4718, "step": 2596500 }, { "epoch": 5.436329838898751, "grad_norm": 16.81142234802246, "learning_rate": 2.286622740774304e-05, "loss": 2.486, "step": 2597000 }, { "epoch": 5.4373764946243766, "grad_norm": 15.190495491027832, "learning_rate": 2.286098315125275e-05, "loss": 2.4755, "step": 2597500 }, { "epoch": 5.438423150350002, "grad_norm": 21.08359718322754, "learning_rate": 2.2855738894762457e-05, "loss": 2.4778, "step": 2598000 }, { "epoch": 5.4394698060756275, "grad_norm": 15.813779830932617, "learning_rate": 2.2850494638272165e-05, "loss": 2.4877, "step": 2598500 }, { "epoch": 5.440516461801253, "grad_norm": 15.00043773651123, "learning_rate": 2.2845250381781876e-05, "loss": 2.468, "step": 2599000 }, { "epoch": 5.441563117526878, "grad_norm": 14.413969993591309, "learning_rate": 2.2840006125291583e-05, "loss": 2.4696, "step": 2599500 }, { "epoch": 5.442609773252504, "grad_norm": 16.819683074951172, "learning_rate": 2.283476186880129e-05, "loss": 2.4776, "step": 2600000 }, { "epoch": 5.443656428978129, "grad_norm": 13.471951484680176, "learning_rate": 2.2829517612311e-05, "loss": 2.4744, "step": 2600500 }, { "epoch": 5.444703084703755, "grad_norm": 14.837113380432129, "learning_rate": 2.2824273355820706e-05, "loss": 2.4641, "step": 2601000 }, { "epoch": 5.44574974042938, "grad_norm": 16.82538414001465, "learning_rate": 2.2819029099330414e-05, "loss": 2.5019, "step": 2601500 }, { "epoch": 5.446796396155006, "grad_norm": 19.431285858154297, "learning_rate": 2.2813784842840122e-05, "loss": 2.4761, "step": 2602000 }, { "epoch": 5.447843051880631, "grad_norm": 14.089606285095215, "learning_rate": 2.280854058634983e-05, "loss": 2.4792, "step": 2602500 }, { "epoch": 5.448889707606257, "grad_norm": 16.174850463867188, "learning_rate": 2.2803296329859537e-05, "loss": 2.4523, "step": 2603000 }, { "epoch": 5.449936363331882, "grad_norm": 16.341506958007812, "learning_rate": 2.279805207336925e-05, "loss": 2.4475, "step": 2603500 }, { "epoch": 5.4509830190575075, "grad_norm": 15.6166353225708, "learning_rate": 2.2792807816878956e-05, "loss": 2.487, "step": 2604000 }, { "epoch": 5.452029674783133, "grad_norm": 17.578134536743164, "learning_rate": 2.2787563560388664e-05, "loss": 2.4688, "step": 2604500 }, { "epoch": 5.453076330508758, "grad_norm": 14.185674667358398, "learning_rate": 2.278231930389837e-05, "loss": 2.4797, "step": 2605000 }, { "epoch": 5.454122986234384, "grad_norm": 21.059602737426758, "learning_rate": 2.277707504740808e-05, "loss": 2.4799, "step": 2605500 }, { "epoch": 5.455169641960009, "grad_norm": 17.176815032958984, "learning_rate": 2.277183079091779e-05, "loss": 2.4911, "step": 2606000 }, { "epoch": 5.456216297685635, "grad_norm": 16.147703170776367, "learning_rate": 2.2766586534427494e-05, "loss": 2.4783, "step": 2606500 }, { "epoch": 5.45726295341126, "grad_norm": 16.463031768798828, "learning_rate": 2.2761342277937202e-05, "loss": 2.462, "step": 2607000 }, { "epoch": 5.458309609136886, "grad_norm": 17.175331115722656, "learning_rate": 2.2756098021446913e-05, "loss": 2.477, "step": 2607500 }, { "epoch": 5.459356264862511, "grad_norm": 16.245710372924805, "learning_rate": 2.275085376495662e-05, "loss": 2.4822, "step": 2608000 }, { "epoch": 5.460402920588137, "grad_norm": 18.791606903076172, "learning_rate": 2.274560950846633e-05, "loss": 2.4728, "step": 2608500 }, { "epoch": 5.461449576313762, "grad_norm": 19.688941955566406, "learning_rate": 2.2740365251976036e-05, "loss": 2.4789, "step": 2609000 }, { "epoch": 5.4624962320393875, "grad_norm": 15.942663192749023, "learning_rate": 2.2735120995485744e-05, "loss": 2.4868, "step": 2609500 }, { "epoch": 5.463542887765013, "grad_norm": 16.627307891845703, "learning_rate": 2.2729876738995455e-05, "loss": 2.4747, "step": 2610000 }, { "epoch": 5.464589543490638, "grad_norm": 14.691227912902832, "learning_rate": 2.2724632482505163e-05, "loss": 2.4698, "step": 2610500 }, { "epoch": 5.465636199216264, "grad_norm": 13.92292308807373, "learning_rate": 2.271938822601487e-05, "loss": 2.4577, "step": 2611000 }, { "epoch": 5.466682854941889, "grad_norm": 15.376433372497559, "learning_rate": 2.2714143969524578e-05, "loss": 2.478, "step": 2611500 }, { "epoch": 5.467729510667515, "grad_norm": 14.99777603149414, "learning_rate": 2.2708899713034286e-05, "loss": 2.4916, "step": 2612000 }, { "epoch": 5.46877616639314, "grad_norm": 15.869207382202148, "learning_rate": 2.2703655456543994e-05, "loss": 2.4884, "step": 2612500 }, { "epoch": 5.469822822118767, "grad_norm": 17.124746322631836, "learning_rate": 2.26984112000537e-05, "loss": 2.4639, "step": 2613000 }, { "epoch": 5.470869477844392, "grad_norm": 19.609956741333008, "learning_rate": 2.269316694356341e-05, "loss": 2.4654, "step": 2613500 }, { "epoch": 5.4719161335700175, "grad_norm": 17.34223747253418, "learning_rate": 2.2687922687073117e-05, "loss": 2.4649, "step": 2614000 }, { "epoch": 5.472962789295643, "grad_norm": 15.15987491607666, "learning_rate": 2.2682678430582828e-05, "loss": 2.4961, "step": 2614500 }, { "epoch": 5.474009445021268, "grad_norm": 12.217729568481445, "learning_rate": 2.2677434174092535e-05, "loss": 2.4929, "step": 2615000 }, { "epoch": 5.475056100746894, "grad_norm": 15.741926193237305, "learning_rate": 2.2672189917602243e-05, "loss": 2.4703, "step": 2615500 }, { "epoch": 5.476102756472519, "grad_norm": 15.662492752075195, "learning_rate": 2.266694566111195e-05, "loss": 2.4756, "step": 2616000 }, { "epoch": 5.477149412198145, "grad_norm": 13.922471046447754, "learning_rate": 2.2661701404621662e-05, "loss": 2.4678, "step": 2616500 }, { "epoch": 5.47819606792377, "grad_norm": 18.026241302490234, "learning_rate": 2.265645714813137e-05, "loss": 2.4597, "step": 2617000 }, { "epoch": 5.479242723649396, "grad_norm": 20.27649688720703, "learning_rate": 2.2651212891641077e-05, "loss": 2.4807, "step": 2617500 }, { "epoch": 5.480289379375021, "grad_norm": 19.081039428710938, "learning_rate": 2.264596863515078e-05, "loss": 2.4571, "step": 2618000 }, { "epoch": 5.481336035100647, "grad_norm": 15.892644882202148, "learning_rate": 2.2640724378660493e-05, "loss": 2.4646, "step": 2618500 }, { "epoch": 5.482382690826272, "grad_norm": 16.022714614868164, "learning_rate": 2.26354801221702e-05, "loss": 2.4725, "step": 2619000 }, { "epoch": 5.4834293465518975, "grad_norm": 15.778480529785156, "learning_rate": 2.2630235865679908e-05, "loss": 2.4786, "step": 2619500 }, { "epoch": 5.484476002277523, "grad_norm": 15.66960620880127, "learning_rate": 2.2624991609189616e-05, "loss": 2.4623, "step": 2620000 }, { "epoch": 5.485522658003148, "grad_norm": 15.286083221435547, "learning_rate": 2.2619747352699323e-05, "loss": 2.4962, "step": 2620500 }, { "epoch": 5.486569313728774, "grad_norm": 13.919151306152344, "learning_rate": 2.2614503096209034e-05, "loss": 2.487, "step": 2621000 }, { "epoch": 5.487615969454399, "grad_norm": 16.86395263671875, "learning_rate": 2.2609258839718742e-05, "loss": 2.4903, "step": 2621500 }, { "epoch": 5.488662625180025, "grad_norm": 13.327149391174316, "learning_rate": 2.260401458322845e-05, "loss": 2.477, "step": 2622000 }, { "epoch": 5.48970928090565, "grad_norm": 15.403400421142578, "learning_rate": 2.2598770326738158e-05, "loss": 2.4722, "step": 2622500 }, { "epoch": 5.490755936631276, "grad_norm": 15.869268417358398, "learning_rate": 2.2593526070247865e-05, "loss": 2.4705, "step": 2623000 }, { "epoch": 5.491802592356901, "grad_norm": 18.92465591430664, "learning_rate": 2.2588281813757573e-05, "loss": 2.4701, "step": 2623500 }, { "epoch": 5.492849248082527, "grad_norm": 15.69943904876709, "learning_rate": 2.258303755726728e-05, "loss": 2.4761, "step": 2624000 }, { "epoch": 5.493895903808152, "grad_norm": 15.274205207824707, "learning_rate": 2.2577793300776988e-05, "loss": 2.4655, "step": 2624500 }, { "epoch": 5.4949425595337775, "grad_norm": 18.262903213500977, "learning_rate": 2.25725490442867e-05, "loss": 2.4747, "step": 2625000 }, { "epoch": 5.495989215259403, "grad_norm": 16.483823776245117, "learning_rate": 2.2567304787796407e-05, "loss": 2.4912, "step": 2625500 }, { "epoch": 5.497035870985028, "grad_norm": 18.006074905395508, "learning_rate": 2.2562060531306115e-05, "loss": 2.4556, "step": 2626000 }, { "epoch": 5.498082526710654, "grad_norm": 18.04937171936035, "learning_rate": 2.2556816274815822e-05, "loss": 2.4556, "step": 2626500 }, { "epoch": 5.499129182436279, "grad_norm": 18.367019653320312, "learning_rate": 2.255157201832553e-05, "loss": 2.4876, "step": 2627000 }, { "epoch": 5.500175838161905, "grad_norm": 17.31007957458496, "learning_rate": 2.254632776183524e-05, "loss": 2.4665, "step": 2627500 }, { "epoch": 5.50122249388753, "grad_norm": 13.64255142211914, "learning_rate": 2.254108350534495e-05, "loss": 2.4564, "step": 2628000 }, { "epoch": 5.502269149613156, "grad_norm": 23.269718170166016, "learning_rate": 2.2535839248854657e-05, "loss": 2.4549, "step": 2628500 }, { "epoch": 5.503315805338781, "grad_norm": 15.071203231811523, "learning_rate": 2.2530594992364364e-05, "loss": 2.4704, "step": 2629000 }, { "epoch": 5.504362461064407, "grad_norm": 14.211538314819336, "learning_rate": 2.2525350735874072e-05, "loss": 2.4669, "step": 2629500 }, { "epoch": 5.505409116790032, "grad_norm": 16.89033317565918, "learning_rate": 2.252010647938378e-05, "loss": 2.4936, "step": 2630000 }, { "epoch": 5.5064557725156575, "grad_norm": 23.056596755981445, "learning_rate": 2.2514862222893487e-05, "loss": 2.4606, "step": 2630500 }, { "epoch": 5.507502428241283, "grad_norm": 14.862422943115234, "learning_rate": 2.2509617966403195e-05, "loss": 2.4929, "step": 2631000 }, { "epoch": 5.5085490839669085, "grad_norm": 15.280682563781738, "learning_rate": 2.2504373709912903e-05, "loss": 2.4613, "step": 2631500 }, { "epoch": 5.509595739692535, "grad_norm": 20.379806518554688, "learning_rate": 2.2499129453422614e-05, "loss": 2.4676, "step": 2632000 }, { "epoch": 5.51064239541816, "grad_norm": 17.228195190429688, "learning_rate": 2.249388519693232e-05, "loss": 2.4762, "step": 2632500 }, { "epoch": 5.511689051143786, "grad_norm": 18.620040893554688, "learning_rate": 2.248864094044203e-05, "loss": 2.4739, "step": 2633000 }, { "epoch": 5.512735706869411, "grad_norm": 15.213255882263184, "learning_rate": 2.2483396683951737e-05, "loss": 2.4687, "step": 2633500 }, { "epoch": 5.513782362595037, "grad_norm": 18.041728973388672, "learning_rate": 2.2478152427461448e-05, "loss": 2.4587, "step": 2634000 }, { "epoch": 5.514829018320662, "grad_norm": 15.344278335571289, "learning_rate": 2.2472908170971156e-05, "loss": 2.481, "step": 2634500 }, { "epoch": 5.5158756740462875, "grad_norm": 16.540685653686523, "learning_rate": 2.246766391448086e-05, "loss": 2.4865, "step": 2635000 }, { "epoch": 5.516922329771913, "grad_norm": 17.636089324951172, "learning_rate": 2.2462419657990568e-05, "loss": 2.4589, "step": 2635500 }, { "epoch": 5.5179689854975384, "grad_norm": 17.524282455444336, "learning_rate": 2.245717540150028e-05, "loss": 2.4605, "step": 2636000 }, { "epoch": 5.519015641223164, "grad_norm": 16.751943588256836, "learning_rate": 2.2451931145009986e-05, "loss": 2.4858, "step": 2636500 }, { "epoch": 5.520062296948789, "grad_norm": 15.742988586425781, "learning_rate": 2.2446686888519694e-05, "loss": 2.4672, "step": 2637000 }, { "epoch": 5.521108952674415, "grad_norm": 15.22386646270752, "learning_rate": 2.2441442632029402e-05, "loss": 2.4615, "step": 2637500 }, { "epoch": 5.52215560840004, "grad_norm": 16.60118865966797, "learning_rate": 2.243619837553911e-05, "loss": 2.4652, "step": 2638000 }, { "epoch": 5.523202264125666, "grad_norm": 16.473033905029297, "learning_rate": 2.243095411904882e-05, "loss": 2.4869, "step": 2638500 }, { "epoch": 5.524248919851291, "grad_norm": 17.426132202148438, "learning_rate": 2.2425709862558528e-05, "loss": 2.4566, "step": 2639000 }, { "epoch": 5.525295575576917, "grad_norm": 15.331143379211426, "learning_rate": 2.2420465606068236e-05, "loss": 2.469, "step": 2639500 }, { "epoch": 5.526342231302542, "grad_norm": 16.67511749267578, "learning_rate": 2.2415221349577944e-05, "loss": 2.4601, "step": 2640000 }, { "epoch": 5.5273888870281676, "grad_norm": 14.28862476348877, "learning_rate": 2.240997709308765e-05, "loss": 2.4632, "step": 2640500 }, { "epoch": 5.528435542753793, "grad_norm": 16.23735237121582, "learning_rate": 2.240473283659736e-05, "loss": 2.4682, "step": 2641000 }, { "epoch": 5.5294821984794185, "grad_norm": 18.909513473510742, "learning_rate": 2.2399488580107067e-05, "loss": 2.4777, "step": 2641500 }, { "epoch": 5.530528854205044, "grad_norm": 16.079389572143555, "learning_rate": 2.2394244323616774e-05, "loss": 2.4705, "step": 2642000 }, { "epoch": 5.531575509930669, "grad_norm": 16.010908126831055, "learning_rate": 2.2389000067126482e-05, "loss": 2.4667, "step": 2642500 }, { "epoch": 5.532622165656295, "grad_norm": 14.930657386779785, "learning_rate": 2.2383755810636193e-05, "loss": 2.4797, "step": 2643000 }, { "epoch": 5.53366882138192, "grad_norm": 15.496732711791992, "learning_rate": 2.23785115541459e-05, "loss": 2.4873, "step": 2643500 }, { "epoch": 5.534715477107546, "grad_norm": 18.058542251586914, "learning_rate": 2.237326729765561e-05, "loss": 2.4673, "step": 2644000 }, { "epoch": 5.535762132833171, "grad_norm": 18.060684204101562, "learning_rate": 2.2368023041165316e-05, "loss": 2.4813, "step": 2644500 }, { "epoch": 5.536808788558797, "grad_norm": 15.945963859558105, "learning_rate": 2.2362778784675027e-05, "loss": 2.4765, "step": 2645000 }, { "epoch": 5.537855444284422, "grad_norm": 12.96367359161377, "learning_rate": 2.2357534528184735e-05, "loss": 2.4626, "step": 2645500 }, { "epoch": 5.538902100010048, "grad_norm": 16.95668601989746, "learning_rate": 2.2352290271694443e-05, "loss": 2.478, "step": 2646000 }, { "epoch": 5.539948755735673, "grad_norm": 14.195844650268555, "learning_rate": 2.2347046015204147e-05, "loss": 2.4677, "step": 2646500 }, { "epoch": 5.5409954114612985, "grad_norm": 14.941856384277344, "learning_rate": 2.2341801758713858e-05, "loss": 2.4609, "step": 2647000 }, { "epoch": 5.542042067186925, "grad_norm": 14.641654968261719, "learning_rate": 2.2336557502223566e-05, "loss": 2.4499, "step": 2647500 }, { "epoch": 5.54308872291255, "grad_norm": 13.541263580322266, "learning_rate": 2.2331313245733274e-05, "loss": 2.4712, "step": 2648000 }, { "epoch": 5.544135378638176, "grad_norm": 15.563417434692383, "learning_rate": 2.232606898924298e-05, "loss": 2.4686, "step": 2648500 }, { "epoch": 5.545182034363801, "grad_norm": 17.219972610473633, "learning_rate": 2.232082473275269e-05, "loss": 2.4694, "step": 2649000 }, { "epoch": 5.546228690089427, "grad_norm": 17.2396183013916, "learning_rate": 2.23155804762624e-05, "loss": 2.4523, "step": 2649500 }, { "epoch": 5.547275345815052, "grad_norm": 15.40766716003418, "learning_rate": 2.2310336219772108e-05, "loss": 2.4649, "step": 2650000 }, { "epoch": 5.548322001540678, "grad_norm": 14.050402641296387, "learning_rate": 2.2305091963281815e-05, "loss": 2.4542, "step": 2650500 }, { "epoch": 5.549368657266303, "grad_norm": 14.545479774475098, "learning_rate": 2.2299847706791523e-05, "loss": 2.4881, "step": 2651000 }, { "epoch": 5.5504153129919285, "grad_norm": 19.620311737060547, "learning_rate": 2.2294603450301234e-05, "loss": 2.4744, "step": 2651500 }, { "epoch": 5.551461968717554, "grad_norm": 20.056499481201172, "learning_rate": 2.228935919381094e-05, "loss": 2.4591, "step": 2652000 }, { "epoch": 5.552508624443179, "grad_norm": 14.460000038146973, "learning_rate": 2.2284114937320646e-05, "loss": 2.4706, "step": 2652500 }, { "epoch": 5.553555280168805, "grad_norm": 15.484172821044922, "learning_rate": 2.2278870680830354e-05, "loss": 2.4533, "step": 2653000 }, { "epoch": 5.55460193589443, "grad_norm": 16.513080596923828, "learning_rate": 2.2273626424340065e-05, "loss": 2.478, "step": 2653500 }, { "epoch": 5.555648591620056, "grad_norm": 14.42464542388916, "learning_rate": 2.2268382167849773e-05, "loss": 2.4554, "step": 2654000 }, { "epoch": 5.556695247345681, "grad_norm": 20.349071502685547, "learning_rate": 2.226313791135948e-05, "loss": 2.4798, "step": 2654500 }, { "epoch": 5.557741903071307, "grad_norm": 17.42767906188965, "learning_rate": 2.2257893654869188e-05, "loss": 2.4753, "step": 2655000 }, { "epoch": 5.558788558796932, "grad_norm": 13.456307411193848, "learning_rate": 2.2252649398378896e-05, "loss": 2.4653, "step": 2655500 }, { "epoch": 5.559835214522558, "grad_norm": 19.393047332763672, "learning_rate": 2.2247405141888607e-05, "loss": 2.4756, "step": 2656000 }, { "epoch": 5.560881870248183, "grad_norm": 15.44996452331543, "learning_rate": 2.2242160885398314e-05, "loss": 2.4785, "step": 2656500 }, { "epoch": 5.5619285259738085, "grad_norm": 16.9460391998291, "learning_rate": 2.2236916628908022e-05, "loss": 2.4741, "step": 2657000 }, { "epoch": 5.562975181699434, "grad_norm": 18.21233367919922, "learning_rate": 2.2231672372417726e-05, "loss": 2.4818, "step": 2657500 }, { "epoch": 5.564021837425059, "grad_norm": 16.290498733520508, "learning_rate": 2.2226428115927437e-05, "loss": 2.4716, "step": 2658000 }, { "epoch": 5.565068493150685, "grad_norm": 15.099666595458984, "learning_rate": 2.2221183859437145e-05, "loss": 2.4539, "step": 2658500 }, { "epoch": 5.56611514887631, "grad_norm": 17.321836471557617, "learning_rate": 2.2215939602946853e-05, "loss": 2.4811, "step": 2659000 }, { "epoch": 5.567161804601936, "grad_norm": 16.47538185119629, "learning_rate": 2.221069534645656e-05, "loss": 2.4928, "step": 2659500 }, { "epoch": 5.568208460327561, "grad_norm": 17.669384002685547, "learning_rate": 2.2205451089966268e-05, "loss": 2.4652, "step": 2660000 }, { "epoch": 5.569255116053187, "grad_norm": 16.542909622192383, "learning_rate": 2.220020683347598e-05, "loss": 2.4474, "step": 2660500 }, { "epoch": 5.570301771778812, "grad_norm": 17.716529846191406, "learning_rate": 2.2194962576985687e-05, "loss": 2.4639, "step": 2661000 }, { "epoch": 5.571348427504438, "grad_norm": 15.947731018066406, "learning_rate": 2.2189718320495395e-05, "loss": 2.4691, "step": 2661500 }, { "epoch": 5.572395083230063, "grad_norm": 21.284549713134766, "learning_rate": 2.2184474064005102e-05, "loss": 2.4638, "step": 2662000 }, { "epoch": 5.5734417389556885, "grad_norm": 14.865070343017578, "learning_rate": 2.2179229807514813e-05, "loss": 2.4822, "step": 2662500 }, { "epoch": 5.574488394681314, "grad_norm": 17.190296173095703, "learning_rate": 2.217398555102452e-05, "loss": 2.4666, "step": 2663000 }, { "epoch": 5.575535050406939, "grad_norm": 19.21499252319336, "learning_rate": 2.2168741294534225e-05, "loss": 2.462, "step": 2663500 }, { "epoch": 5.576581706132565, "grad_norm": 18.78744125366211, "learning_rate": 2.2163497038043933e-05, "loss": 2.4477, "step": 2664000 }, { "epoch": 5.57762836185819, "grad_norm": 17.680904388427734, "learning_rate": 2.2158252781553644e-05, "loss": 2.4659, "step": 2664500 }, { "epoch": 5.578675017583816, "grad_norm": 15.870976448059082, "learning_rate": 2.2153008525063352e-05, "loss": 2.4683, "step": 2665000 }, { "epoch": 5.579721673309441, "grad_norm": 14.499258041381836, "learning_rate": 2.214776426857306e-05, "loss": 2.4578, "step": 2665500 }, { "epoch": 5.580768329035067, "grad_norm": 16.95970344543457, "learning_rate": 2.2142520012082767e-05, "loss": 2.4751, "step": 2666000 }, { "epoch": 5.581814984760692, "grad_norm": 22.532569885253906, "learning_rate": 2.2137275755592475e-05, "loss": 2.4599, "step": 2666500 }, { "epoch": 5.5828616404863185, "grad_norm": 18.331592559814453, "learning_rate": 2.2132031499102186e-05, "loss": 2.4582, "step": 2667000 }, { "epoch": 5.583908296211944, "grad_norm": 14.679238319396973, "learning_rate": 2.2126787242611894e-05, "loss": 2.4578, "step": 2667500 }, { "epoch": 5.584954951937569, "grad_norm": 15.522576332092285, "learning_rate": 2.21215429861216e-05, "loss": 2.4695, "step": 2668000 }, { "epoch": 5.586001607663195, "grad_norm": 16.880813598632812, "learning_rate": 2.211629872963131e-05, "loss": 2.4602, "step": 2668500 }, { "epoch": 5.58704826338882, "grad_norm": 15.892890930175781, "learning_rate": 2.2111054473141017e-05, "loss": 2.497, "step": 2669000 }, { "epoch": 5.588094919114446, "grad_norm": 13.409199714660645, "learning_rate": 2.2105810216650725e-05, "loss": 2.468, "step": 2669500 }, { "epoch": 5.589141574840071, "grad_norm": 14.354802131652832, "learning_rate": 2.2100565960160432e-05, "loss": 2.4616, "step": 2670000 }, { "epoch": 5.590188230565697, "grad_norm": 20.68058967590332, "learning_rate": 2.209532170367014e-05, "loss": 2.4645, "step": 2670500 }, { "epoch": 5.591234886291322, "grad_norm": 17.15547752380371, "learning_rate": 2.209007744717985e-05, "loss": 2.45, "step": 2671000 }, { "epoch": 5.592281542016948, "grad_norm": 16.334707260131836, "learning_rate": 2.208483319068956e-05, "loss": 2.4848, "step": 2671500 }, { "epoch": 5.593328197742573, "grad_norm": 14.083610534667969, "learning_rate": 2.2079588934199266e-05, "loss": 2.4792, "step": 2672000 }, { "epoch": 5.5943748534681985, "grad_norm": 17.13959312438965, "learning_rate": 2.2074344677708974e-05, "loss": 2.4667, "step": 2672500 }, { "epoch": 5.595421509193824, "grad_norm": 17.8581485748291, "learning_rate": 2.2069100421218682e-05, "loss": 2.4917, "step": 2673000 }, { "epoch": 5.596468164919449, "grad_norm": 15.537131309509277, "learning_rate": 2.2063856164728393e-05, "loss": 2.4498, "step": 2673500 }, { "epoch": 5.597514820645075, "grad_norm": 14.23122501373291, "learning_rate": 2.20586119082381e-05, "loss": 2.455, "step": 2674000 }, { "epoch": 5.5985614763707, "grad_norm": 14.68184757232666, "learning_rate": 2.2053367651747805e-05, "loss": 2.4423, "step": 2674500 }, { "epoch": 5.599608132096326, "grad_norm": 15.283702850341797, "learning_rate": 2.2048123395257513e-05, "loss": 2.4833, "step": 2675000 }, { "epoch": 5.600654787821951, "grad_norm": 16.620450973510742, "learning_rate": 2.2042879138767224e-05, "loss": 2.4699, "step": 2675500 }, { "epoch": 5.601701443547577, "grad_norm": 17.828876495361328, "learning_rate": 2.203763488227693e-05, "loss": 2.4704, "step": 2676000 }, { "epoch": 5.602748099273202, "grad_norm": 22.114849090576172, "learning_rate": 2.203239062578664e-05, "loss": 2.4738, "step": 2676500 }, { "epoch": 5.603794754998828, "grad_norm": 16.2982120513916, "learning_rate": 2.2027146369296347e-05, "loss": 2.4477, "step": 2677000 }, { "epoch": 5.604841410724453, "grad_norm": 16.778207778930664, "learning_rate": 2.2021902112806054e-05, "loss": 2.4721, "step": 2677500 }, { "epoch": 5.6058880664500785, "grad_norm": 15.447460174560547, "learning_rate": 2.2016657856315765e-05, "loss": 2.4696, "step": 2678000 }, { "epoch": 5.606934722175704, "grad_norm": 14.838472366333008, "learning_rate": 2.2011413599825473e-05, "loss": 2.4534, "step": 2678500 }, { "epoch": 5.6079813779013294, "grad_norm": 14.711958885192871, "learning_rate": 2.200616934333518e-05, "loss": 2.4896, "step": 2679000 }, { "epoch": 5.609028033626955, "grad_norm": 16.22727394104004, "learning_rate": 2.200092508684489e-05, "loss": 2.463, "step": 2679500 }, { "epoch": 5.61007468935258, "grad_norm": 14.305465698242188, "learning_rate": 2.19956808303546e-05, "loss": 2.4629, "step": 2680000 }, { "epoch": 5.611121345078206, "grad_norm": 19.23126983642578, "learning_rate": 2.1990436573864304e-05, "loss": 2.4635, "step": 2680500 }, { "epoch": 5.612168000803831, "grad_norm": 18.091014862060547, "learning_rate": 2.198519231737401e-05, "loss": 2.4572, "step": 2681000 }, { "epoch": 5.613214656529457, "grad_norm": 13.290536880493164, "learning_rate": 2.197994806088372e-05, "loss": 2.4633, "step": 2681500 }, { "epoch": 5.614261312255082, "grad_norm": 17.925559997558594, "learning_rate": 2.197470380439343e-05, "loss": 2.4613, "step": 2682000 }, { "epoch": 5.6153079679807085, "grad_norm": 14.956313133239746, "learning_rate": 2.1969459547903138e-05, "loss": 2.4599, "step": 2682500 }, { "epoch": 5.616354623706334, "grad_norm": 16.37174415588379, "learning_rate": 2.1964215291412846e-05, "loss": 2.4466, "step": 2683000 }, { "epoch": 5.617401279431959, "grad_norm": 15.628686904907227, "learning_rate": 2.1958971034922553e-05, "loss": 2.4724, "step": 2683500 }, { "epoch": 5.618447935157585, "grad_norm": 14.186441421508789, "learning_rate": 2.195372677843226e-05, "loss": 2.4552, "step": 2684000 }, { "epoch": 5.61949459088321, "grad_norm": 16.651891708374023, "learning_rate": 2.1948482521941972e-05, "loss": 2.4632, "step": 2684500 }, { "epoch": 5.620541246608836, "grad_norm": 16.794002532958984, "learning_rate": 2.194323826545168e-05, "loss": 2.4646, "step": 2685000 }, { "epoch": 5.621587902334461, "grad_norm": 13.951940536499023, "learning_rate": 2.1937994008961388e-05, "loss": 2.4587, "step": 2685500 }, { "epoch": 5.622634558060087, "grad_norm": 16.020950317382812, "learning_rate": 2.1932749752471092e-05, "loss": 2.4801, "step": 2686000 }, { "epoch": 5.623681213785712, "grad_norm": 17.073434829711914, "learning_rate": 2.1927505495980803e-05, "loss": 2.4469, "step": 2686500 }, { "epoch": 5.624727869511338, "grad_norm": 16.152788162231445, "learning_rate": 2.192226123949051e-05, "loss": 2.458, "step": 2687000 }, { "epoch": 5.625774525236963, "grad_norm": 16.508075714111328, "learning_rate": 2.191701698300022e-05, "loss": 2.4666, "step": 2687500 }, { "epoch": 5.6268211809625885, "grad_norm": 15.014933586120605, "learning_rate": 2.1911772726509926e-05, "loss": 2.4833, "step": 2688000 }, { "epoch": 5.627867836688214, "grad_norm": 13.447696685791016, "learning_rate": 2.1906528470019637e-05, "loss": 2.4565, "step": 2688500 }, { "epoch": 5.6289144924138395, "grad_norm": 14.67487907409668, "learning_rate": 2.1901284213529345e-05, "loss": 2.4674, "step": 2689000 }, { "epoch": 5.629961148139465, "grad_norm": 18.111186981201172, "learning_rate": 2.1896039957039053e-05, "loss": 2.4425, "step": 2689500 }, { "epoch": 5.63100780386509, "grad_norm": 14.389999389648438, "learning_rate": 2.189079570054876e-05, "loss": 2.4681, "step": 2690000 }, { "epoch": 5.632054459590716, "grad_norm": 16.07379913330078, "learning_rate": 2.1885551444058468e-05, "loss": 2.4768, "step": 2690500 }, { "epoch": 5.633101115316341, "grad_norm": 17.073749542236328, "learning_rate": 2.188030718756818e-05, "loss": 2.4646, "step": 2691000 }, { "epoch": 5.634147771041967, "grad_norm": 15.342644691467285, "learning_rate": 2.1875062931077883e-05, "loss": 2.436, "step": 2691500 }, { "epoch": 5.635194426767592, "grad_norm": 16.06455421447754, "learning_rate": 2.186981867458759e-05, "loss": 2.4554, "step": 2692000 }, { "epoch": 5.636241082493218, "grad_norm": 17.368118286132812, "learning_rate": 2.18645744180973e-05, "loss": 2.4781, "step": 2692500 }, { "epoch": 5.637287738218843, "grad_norm": 17.48084259033203, "learning_rate": 2.185933016160701e-05, "loss": 2.4585, "step": 2693000 }, { "epoch": 5.638334393944469, "grad_norm": 15.562224388122559, "learning_rate": 2.1854085905116717e-05, "loss": 2.4612, "step": 2693500 }, { "epoch": 5.639381049670094, "grad_norm": 13.813969612121582, "learning_rate": 2.1848841648626425e-05, "loss": 2.4393, "step": 2694000 }, { "epoch": 5.6404277053957195, "grad_norm": 16.005979537963867, "learning_rate": 2.1843597392136133e-05, "loss": 2.4578, "step": 2694500 }, { "epoch": 5.641474361121345, "grad_norm": 17.158342361450195, "learning_rate": 2.183835313564584e-05, "loss": 2.4575, "step": 2695000 }, { "epoch": 5.64252101684697, "grad_norm": 15.099015235900879, "learning_rate": 2.183310887915555e-05, "loss": 2.4581, "step": 2695500 }, { "epoch": 5.643567672572596, "grad_norm": 16.08185386657715, "learning_rate": 2.182786462266526e-05, "loss": 2.4688, "step": 2696000 }, { "epoch": 5.644614328298221, "grad_norm": 17.435440063476562, "learning_rate": 2.1822620366174967e-05, "loss": 2.4615, "step": 2696500 }, { "epoch": 5.645660984023847, "grad_norm": 16.36138916015625, "learning_rate": 2.1817376109684675e-05, "loss": 2.4665, "step": 2697000 }, { "epoch": 5.646707639749472, "grad_norm": 14.462323188781738, "learning_rate": 2.1812131853194382e-05, "loss": 2.461, "step": 2697500 }, { "epoch": 5.647754295475098, "grad_norm": 17.167171478271484, "learning_rate": 2.180688759670409e-05, "loss": 2.4654, "step": 2698000 }, { "epoch": 5.648800951200723, "grad_norm": 19.260347366333008, "learning_rate": 2.1801643340213798e-05, "loss": 2.478, "step": 2698500 }, { "epoch": 5.649847606926349, "grad_norm": 16.87256622314453, "learning_rate": 2.1796399083723505e-05, "loss": 2.4649, "step": 2699000 }, { "epoch": 5.650894262651974, "grad_norm": 16.482797622680664, "learning_rate": 2.1791154827233217e-05, "loss": 2.4646, "step": 2699500 }, { "epoch": 5.6519409183775995, "grad_norm": 16.468990325927734, "learning_rate": 2.1785910570742924e-05, "loss": 2.457, "step": 2700000 }, { "epoch": 5.652987574103225, "grad_norm": 16.48467445373535, "learning_rate": 2.1780666314252632e-05, "loss": 2.4484, "step": 2700500 }, { "epoch": 5.65403422982885, "grad_norm": 16.49082374572754, "learning_rate": 2.177542205776234e-05, "loss": 2.4686, "step": 2701000 }, { "epoch": 5.655080885554476, "grad_norm": 14.574026107788086, "learning_rate": 2.1770177801272047e-05, "loss": 2.4748, "step": 2701500 }, { "epoch": 5.656127541280102, "grad_norm": 12.768967628479004, "learning_rate": 2.176493354478176e-05, "loss": 2.4531, "step": 2702000 }, { "epoch": 5.657174197005728, "grad_norm": 14.955398559570312, "learning_rate": 2.1759689288291466e-05, "loss": 2.4536, "step": 2702500 }, { "epoch": 5.658220852731353, "grad_norm": 13.72415542602539, "learning_rate": 2.175444503180117e-05, "loss": 2.4668, "step": 2703000 }, { "epoch": 5.659267508456979, "grad_norm": 16.625652313232422, "learning_rate": 2.1749200775310878e-05, "loss": 2.4707, "step": 2703500 }, { "epoch": 5.660314164182604, "grad_norm": 17.1499080657959, "learning_rate": 2.174395651882059e-05, "loss": 2.4593, "step": 2704000 }, { "epoch": 5.6613608199082295, "grad_norm": 17.218780517578125, "learning_rate": 2.1738712262330297e-05, "loss": 2.4872, "step": 2704500 }, { "epoch": 5.662407475633855, "grad_norm": 15.247515678405762, "learning_rate": 2.1733468005840005e-05, "loss": 2.4651, "step": 2705000 }, { "epoch": 5.66345413135948, "grad_norm": 15.656389236450195, "learning_rate": 2.1728223749349712e-05, "loss": 2.4676, "step": 2705500 }, { "epoch": 5.664500787085106, "grad_norm": 15.084992408752441, "learning_rate": 2.172297949285942e-05, "loss": 2.4561, "step": 2706000 }, { "epoch": 5.665547442810731, "grad_norm": 15.878186225891113, "learning_rate": 2.171773523636913e-05, "loss": 2.4605, "step": 2706500 }, { "epoch": 5.666594098536357, "grad_norm": 20.50290870666504, "learning_rate": 2.171249097987884e-05, "loss": 2.4565, "step": 2707000 }, { "epoch": 5.667640754261982, "grad_norm": 18.267581939697266, "learning_rate": 2.1707246723388546e-05, "loss": 2.4495, "step": 2707500 }, { "epoch": 5.668687409987608, "grad_norm": 17.233322143554688, "learning_rate": 2.1702002466898254e-05, "loss": 2.4598, "step": 2708000 }, { "epoch": 5.669734065713233, "grad_norm": 19.371309280395508, "learning_rate": 2.1696758210407962e-05, "loss": 2.4549, "step": 2708500 }, { "epoch": 5.670780721438859, "grad_norm": 17.780155181884766, "learning_rate": 2.169151395391767e-05, "loss": 2.457, "step": 2709000 }, { "epoch": 5.671827377164484, "grad_norm": 15.617731094360352, "learning_rate": 2.1686269697427377e-05, "loss": 2.4589, "step": 2709500 }, { "epoch": 5.6728740328901095, "grad_norm": 17.373376846313477, "learning_rate": 2.1681025440937085e-05, "loss": 2.4526, "step": 2710000 }, { "epoch": 5.673920688615735, "grad_norm": 16.868480682373047, "learning_rate": 2.1675781184446796e-05, "loss": 2.4506, "step": 2710500 }, { "epoch": 5.67496734434136, "grad_norm": 15.715373039245605, "learning_rate": 2.1670536927956504e-05, "loss": 2.4695, "step": 2711000 }, { "epoch": 5.676014000066986, "grad_norm": 15.412042617797852, "learning_rate": 2.166529267146621e-05, "loss": 2.4638, "step": 2711500 }, { "epoch": 5.677060655792611, "grad_norm": 15.690680503845215, "learning_rate": 2.166004841497592e-05, "loss": 2.4706, "step": 2712000 }, { "epoch": 5.678107311518237, "grad_norm": 16.486833572387695, "learning_rate": 2.1654804158485627e-05, "loss": 2.4503, "step": 2712500 }, { "epoch": 5.679153967243862, "grad_norm": 19.555261611938477, "learning_rate": 2.1649559901995338e-05, "loss": 2.4582, "step": 2713000 }, { "epoch": 5.680200622969488, "grad_norm": 14.706911087036133, "learning_rate": 2.1644315645505045e-05, "loss": 2.4372, "step": 2713500 }, { "epoch": 5.681247278695113, "grad_norm": 17.47011947631836, "learning_rate": 2.1639071389014753e-05, "loss": 2.4436, "step": 2714000 }, { "epoch": 5.682293934420739, "grad_norm": 20.53410530090332, "learning_rate": 2.1633827132524457e-05, "loss": 2.4427, "step": 2714500 }, { "epoch": 5.683340590146364, "grad_norm": 16.89705467224121, "learning_rate": 2.162858287603417e-05, "loss": 2.4619, "step": 2715000 }, { "epoch": 5.6843872458719895, "grad_norm": 19.311532974243164, "learning_rate": 2.1623338619543876e-05, "loss": 2.4703, "step": 2715500 }, { "epoch": 5.685433901597615, "grad_norm": 13.24427604675293, "learning_rate": 2.1618094363053584e-05, "loss": 2.4627, "step": 2716000 }, { "epoch": 5.68648055732324, "grad_norm": 16.38771629333496, "learning_rate": 2.161285010656329e-05, "loss": 2.4619, "step": 2716500 }, { "epoch": 5.687527213048866, "grad_norm": 15.59527587890625, "learning_rate": 2.1607605850073003e-05, "loss": 2.4605, "step": 2717000 }, { "epoch": 5.688573868774492, "grad_norm": 16.205228805541992, "learning_rate": 2.160236159358271e-05, "loss": 2.4552, "step": 2717500 }, { "epoch": 5.689620524500118, "grad_norm": 14.615553855895996, "learning_rate": 2.1597117337092418e-05, "loss": 2.4659, "step": 2718000 }, { "epoch": 5.690667180225743, "grad_norm": 15.238909721374512, "learning_rate": 2.1591873080602126e-05, "loss": 2.4448, "step": 2718500 }, { "epoch": 5.691713835951369, "grad_norm": 17.145915985107422, "learning_rate": 2.1586628824111833e-05, "loss": 2.4493, "step": 2719000 }, { "epoch": 5.692760491676994, "grad_norm": 17.204601287841797, "learning_rate": 2.1581384567621544e-05, "loss": 2.4537, "step": 2719500 }, { "epoch": 5.6938071474026195, "grad_norm": 16.710159301757812, "learning_rate": 2.157614031113125e-05, "loss": 2.4515, "step": 2720000 }, { "epoch": 5.694853803128245, "grad_norm": 13.885233879089355, "learning_rate": 2.1570896054640956e-05, "loss": 2.4527, "step": 2720500 }, { "epoch": 5.69590045885387, "grad_norm": 15.632088661193848, "learning_rate": 2.1565651798150664e-05, "loss": 2.4397, "step": 2721000 }, { "epoch": 5.696947114579496, "grad_norm": 15.153646469116211, "learning_rate": 2.1560407541660375e-05, "loss": 2.4418, "step": 2721500 }, { "epoch": 5.697993770305121, "grad_norm": 15.596002578735352, "learning_rate": 2.1555163285170083e-05, "loss": 2.4642, "step": 2722000 }, { "epoch": 5.699040426030747, "grad_norm": 14.402970314025879, "learning_rate": 2.154991902867979e-05, "loss": 2.4586, "step": 2722500 }, { "epoch": 5.700087081756372, "grad_norm": 16.74958038330078, "learning_rate": 2.15446747721895e-05, "loss": 2.4492, "step": 2723000 }, { "epoch": 5.701133737481998, "grad_norm": 14.381986618041992, "learning_rate": 2.1539430515699206e-05, "loss": 2.4578, "step": 2723500 }, { "epoch": 5.702180393207623, "grad_norm": 15.656999588012695, "learning_rate": 2.1534186259208917e-05, "loss": 2.4528, "step": 2724000 }, { "epoch": 5.703227048933249, "grad_norm": 17.23037338256836, "learning_rate": 2.1528942002718625e-05, "loss": 2.4674, "step": 2724500 }, { "epoch": 5.704273704658874, "grad_norm": 15.152880668640137, "learning_rate": 2.1523697746228332e-05, "loss": 2.4639, "step": 2725000 }, { "epoch": 5.7053203603844995, "grad_norm": 16.196622848510742, "learning_rate": 2.151845348973804e-05, "loss": 2.4596, "step": 2725500 }, { "epoch": 5.706367016110125, "grad_norm": 16.273439407348633, "learning_rate": 2.1513209233247748e-05, "loss": 2.4503, "step": 2726000 }, { "epoch": 5.70741367183575, "grad_norm": 14.065092086791992, "learning_rate": 2.1507964976757456e-05, "loss": 2.4578, "step": 2726500 }, { "epoch": 5.708460327561376, "grad_norm": 12.553817749023438, "learning_rate": 2.1502720720267163e-05, "loss": 2.4753, "step": 2727000 }, { "epoch": 5.709506983287001, "grad_norm": 22.329082489013672, "learning_rate": 2.149747646377687e-05, "loss": 2.4722, "step": 2727500 }, { "epoch": 5.710553639012627, "grad_norm": 13.68724536895752, "learning_rate": 2.1492232207286582e-05, "loss": 2.4556, "step": 2728000 }, { "epoch": 5.711600294738252, "grad_norm": 16.995847702026367, "learning_rate": 2.148698795079629e-05, "loss": 2.4418, "step": 2728500 }, { "epoch": 5.712646950463878, "grad_norm": 17.262775421142578, "learning_rate": 2.1481743694305997e-05, "loss": 2.4405, "step": 2729000 }, { "epoch": 5.713693606189503, "grad_norm": 17.102617263793945, "learning_rate": 2.1476499437815705e-05, "loss": 2.455, "step": 2729500 }, { "epoch": 5.714740261915129, "grad_norm": 17.954315185546875, "learning_rate": 2.1471255181325413e-05, "loss": 2.4749, "step": 2730000 }, { "epoch": 5.715786917640754, "grad_norm": 16.57422637939453, "learning_rate": 2.1466010924835124e-05, "loss": 2.4674, "step": 2730500 }, { "epoch": 5.7168335733663795, "grad_norm": 15.387385368347168, "learning_rate": 2.146076666834483e-05, "loss": 2.4692, "step": 2731000 }, { "epoch": 5.717880229092005, "grad_norm": 17.09117889404297, "learning_rate": 2.1455522411854536e-05, "loss": 2.4666, "step": 2731500 }, { "epoch": 5.7189268848176305, "grad_norm": 16.22782325744629, "learning_rate": 2.1450278155364244e-05, "loss": 2.456, "step": 2732000 }, { "epoch": 5.719973540543256, "grad_norm": 15.572466850280762, "learning_rate": 2.1445033898873955e-05, "loss": 2.4538, "step": 2732500 }, { "epoch": 5.721020196268881, "grad_norm": 15.128133773803711, "learning_rate": 2.1439789642383662e-05, "loss": 2.459, "step": 2733000 }, { "epoch": 5.722066851994507, "grad_norm": 17.897794723510742, "learning_rate": 2.143454538589337e-05, "loss": 2.4642, "step": 2733500 }, { "epoch": 5.723113507720132, "grad_norm": 18.995784759521484, "learning_rate": 2.1429301129403078e-05, "loss": 2.4586, "step": 2734000 }, { "epoch": 5.724160163445758, "grad_norm": 15.923019409179688, "learning_rate": 2.142405687291279e-05, "loss": 2.4521, "step": 2734500 }, { "epoch": 5.725206819171383, "grad_norm": 15.358650207519531, "learning_rate": 2.1418812616422496e-05, "loss": 2.4596, "step": 2735000 }, { "epoch": 5.726253474897009, "grad_norm": 21.86969757080078, "learning_rate": 2.1413568359932204e-05, "loss": 2.457, "step": 2735500 }, { "epoch": 5.727300130622634, "grad_norm": 16.322429656982422, "learning_rate": 2.1408324103441912e-05, "loss": 2.4504, "step": 2736000 }, { "epoch": 5.72834678634826, "grad_norm": 16.757320404052734, "learning_rate": 2.140307984695162e-05, "loss": 2.4495, "step": 2736500 }, { "epoch": 5.729393442073886, "grad_norm": 16.780820846557617, "learning_rate": 2.1397835590461327e-05, "loss": 2.4672, "step": 2737000 }, { "epoch": 5.730440097799511, "grad_norm": 16.996477127075195, "learning_rate": 2.1392591333971035e-05, "loss": 2.4772, "step": 2737500 }, { "epoch": 5.731486753525137, "grad_norm": 19.25312614440918, "learning_rate": 2.1387347077480743e-05, "loss": 2.4673, "step": 2738000 }, { "epoch": 5.732533409250762, "grad_norm": 18.059804916381836, "learning_rate": 2.138210282099045e-05, "loss": 2.4515, "step": 2738500 }, { "epoch": 5.733580064976388, "grad_norm": 16.684574127197266, "learning_rate": 2.137685856450016e-05, "loss": 2.466, "step": 2739000 }, { "epoch": 5.734626720702013, "grad_norm": 18.319705963134766, "learning_rate": 2.137161430800987e-05, "loss": 2.4677, "step": 2739500 }, { "epoch": 5.735673376427639, "grad_norm": 13.501473426818848, "learning_rate": 2.1366370051519577e-05, "loss": 2.4738, "step": 2740000 }, { "epoch": 5.736720032153264, "grad_norm": 16.87696647644043, "learning_rate": 2.1361125795029284e-05, "loss": 2.4587, "step": 2740500 }, { "epoch": 5.7377666878788895, "grad_norm": 14.000175476074219, "learning_rate": 2.1355881538538992e-05, "loss": 2.4663, "step": 2741000 }, { "epoch": 5.738813343604515, "grad_norm": 19.926589965820312, "learning_rate": 2.1350637282048703e-05, "loss": 2.4777, "step": 2741500 }, { "epoch": 5.7398599993301405, "grad_norm": 18.18353843688965, "learning_rate": 2.134539302555841e-05, "loss": 2.4625, "step": 2742000 }, { "epoch": 5.740906655055766, "grad_norm": 16.151811599731445, "learning_rate": 2.134014876906812e-05, "loss": 2.462, "step": 2742500 }, { "epoch": 5.741953310781391, "grad_norm": 15.010868072509766, "learning_rate": 2.1334904512577826e-05, "loss": 2.4439, "step": 2743000 }, { "epoch": 5.742999966507017, "grad_norm": 16.90671730041504, "learning_rate": 2.1329660256087534e-05, "loss": 2.4592, "step": 2743500 }, { "epoch": 5.744046622232642, "grad_norm": 17.792762756347656, "learning_rate": 2.1324415999597242e-05, "loss": 2.472, "step": 2744000 }, { "epoch": 5.745093277958268, "grad_norm": 17.969573974609375, "learning_rate": 2.131917174310695e-05, "loss": 2.4583, "step": 2744500 }, { "epoch": 5.746139933683893, "grad_norm": 16.21164321899414, "learning_rate": 2.1313927486616657e-05, "loss": 2.4447, "step": 2745000 }, { "epoch": 5.747186589409519, "grad_norm": 17.5384464263916, "learning_rate": 2.1308683230126368e-05, "loss": 2.454, "step": 2745500 }, { "epoch": 5.748233245135144, "grad_norm": 17.22732925415039, "learning_rate": 2.1303438973636076e-05, "loss": 2.4646, "step": 2746000 }, { "epoch": 5.74927990086077, "grad_norm": 21.98215103149414, "learning_rate": 2.1298194717145784e-05, "loss": 2.4654, "step": 2746500 }, { "epoch": 5.750326556586395, "grad_norm": 14.30354118347168, "learning_rate": 2.129295046065549e-05, "loss": 2.4533, "step": 2747000 }, { "epoch": 5.7513732123120205, "grad_norm": 14.509764671325684, "learning_rate": 2.12877062041652e-05, "loss": 2.4708, "step": 2747500 }, { "epoch": 5.752419868037646, "grad_norm": 13.309767723083496, "learning_rate": 2.128246194767491e-05, "loss": 2.4645, "step": 2748000 }, { "epoch": 5.753466523763271, "grad_norm": 19.792213439941406, "learning_rate": 2.1277217691184614e-05, "loss": 2.4506, "step": 2748500 }, { "epoch": 5.754513179488897, "grad_norm": 18.975746154785156, "learning_rate": 2.1271973434694322e-05, "loss": 2.4593, "step": 2749000 }, { "epoch": 5.755559835214522, "grad_norm": 16.488462448120117, "learning_rate": 2.126672917820403e-05, "loss": 2.4598, "step": 2749500 }, { "epoch": 5.756606490940148, "grad_norm": 16.615949630737305, "learning_rate": 2.126148492171374e-05, "loss": 2.4719, "step": 2750000 }, { "epoch": 5.757653146665773, "grad_norm": 16.17789649963379, "learning_rate": 2.125624066522345e-05, "loss": 2.4585, "step": 2750500 }, { "epoch": 5.758699802391399, "grad_norm": 16.99569320678711, "learning_rate": 2.1250996408733156e-05, "loss": 2.4662, "step": 2751000 }, { "epoch": 5.759746458117024, "grad_norm": 16.504587173461914, "learning_rate": 2.1245752152242864e-05, "loss": 2.4577, "step": 2751500 }, { "epoch": 5.76079311384265, "grad_norm": 18.008737564086914, "learning_rate": 2.1240507895752575e-05, "loss": 2.46, "step": 2752000 }, { "epoch": 5.761839769568276, "grad_norm": 17.386131286621094, "learning_rate": 2.1235263639262283e-05, "loss": 2.4403, "step": 2752500 }, { "epoch": 5.762886425293901, "grad_norm": 17.041439056396484, "learning_rate": 2.123001938277199e-05, "loss": 2.4516, "step": 2753000 }, { "epoch": 5.763933081019527, "grad_norm": 15.608407974243164, "learning_rate": 2.1224775126281698e-05, "loss": 2.4715, "step": 2753500 }, { "epoch": 5.764979736745152, "grad_norm": 15.986942291259766, "learning_rate": 2.1219530869791406e-05, "loss": 2.4532, "step": 2754000 }, { "epoch": 5.766026392470778, "grad_norm": 17.47457504272461, "learning_rate": 2.1214286613301113e-05, "loss": 2.467, "step": 2754500 }, { "epoch": 5.767073048196403, "grad_norm": 19.011659622192383, "learning_rate": 2.120904235681082e-05, "loss": 2.4491, "step": 2755000 }, { "epoch": 5.768119703922029, "grad_norm": 14.441469192504883, "learning_rate": 2.120379810032053e-05, "loss": 2.456, "step": 2755500 }, { "epoch": 5.769166359647654, "grad_norm": 17.8342227935791, "learning_rate": 2.1198553843830236e-05, "loss": 2.4581, "step": 2756000 }, { "epoch": 5.77021301537328, "grad_norm": 13.883576393127441, "learning_rate": 2.1193309587339948e-05, "loss": 2.4393, "step": 2756500 }, { "epoch": 5.771259671098905, "grad_norm": 21.086666107177734, "learning_rate": 2.1188065330849655e-05, "loss": 2.4566, "step": 2757000 }, { "epoch": 5.7723063268245305, "grad_norm": 16.686674118041992, "learning_rate": 2.1182821074359363e-05, "loss": 2.4404, "step": 2757500 }, { "epoch": 5.773352982550156, "grad_norm": 16.388076782226562, "learning_rate": 2.117757681786907e-05, "loss": 2.4551, "step": 2758000 }, { "epoch": 5.774399638275781, "grad_norm": 15.302160263061523, "learning_rate": 2.1172332561378778e-05, "loss": 2.4653, "step": 2758500 }, { "epoch": 5.775446294001407, "grad_norm": 17.891633987426758, "learning_rate": 2.116708830488849e-05, "loss": 2.4491, "step": 2759000 }, { "epoch": 5.776492949727032, "grad_norm": 19.614103317260742, "learning_rate": 2.1161844048398197e-05, "loss": 2.4449, "step": 2759500 }, { "epoch": 5.777539605452658, "grad_norm": 17.211706161499023, "learning_rate": 2.11565997919079e-05, "loss": 2.4632, "step": 2760000 }, { "epoch": 5.778586261178283, "grad_norm": 15.509736061096191, "learning_rate": 2.115135553541761e-05, "loss": 2.4643, "step": 2760500 }, { "epoch": 5.779632916903909, "grad_norm": 16.384721755981445, "learning_rate": 2.114611127892732e-05, "loss": 2.4312, "step": 2761000 }, { "epoch": 5.780679572629534, "grad_norm": 17.351722717285156, "learning_rate": 2.1140867022437028e-05, "loss": 2.4464, "step": 2761500 }, { "epoch": 5.78172622835516, "grad_norm": 19.493200302124023, "learning_rate": 2.1135622765946736e-05, "loss": 2.4621, "step": 2762000 }, { "epoch": 5.782772884080785, "grad_norm": 14.00461196899414, "learning_rate": 2.1130378509456443e-05, "loss": 2.446, "step": 2762500 }, { "epoch": 5.7838195398064105, "grad_norm": 17.291461944580078, "learning_rate": 2.1125134252966154e-05, "loss": 2.4673, "step": 2763000 }, { "epoch": 5.784866195532036, "grad_norm": 16.907394409179688, "learning_rate": 2.1119889996475862e-05, "loss": 2.4458, "step": 2763500 }, { "epoch": 5.785912851257661, "grad_norm": 16.873706817626953, "learning_rate": 2.111464573998557e-05, "loss": 2.4519, "step": 2764000 }, { "epoch": 5.786959506983287, "grad_norm": 17.033313751220703, "learning_rate": 2.1109401483495277e-05, "loss": 2.4739, "step": 2764500 }, { "epoch": 5.788006162708912, "grad_norm": 18.62765884399414, "learning_rate": 2.1104157227004985e-05, "loss": 2.4764, "step": 2765000 }, { "epoch": 5.789052818434538, "grad_norm": 16.501583099365234, "learning_rate": 2.1098912970514693e-05, "loss": 2.4637, "step": 2765500 }, { "epoch": 5.790099474160163, "grad_norm": 18.43709373474121, "learning_rate": 2.10936687140244e-05, "loss": 2.428, "step": 2766000 }, { "epoch": 5.791146129885789, "grad_norm": 14.014009475708008, "learning_rate": 2.1088424457534108e-05, "loss": 2.4547, "step": 2766500 }, { "epoch": 5.792192785611414, "grad_norm": 14.036188125610352, "learning_rate": 2.1083180201043816e-05, "loss": 2.4569, "step": 2767000 }, { "epoch": 5.79323944133704, "grad_norm": 13.209744453430176, "learning_rate": 2.1077935944553527e-05, "loss": 2.4634, "step": 2767500 }, { "epoch": 5.794286097062665, "grad_norm": 13.788847923278809, "learning_rate": 2.1072691688063235e-05, "loss": 2.4618, "step": 2768000 }, { "epoch": 5.7953327527882905, "grad_norm": 15.698433876037598, "learning_rate": 2.1067447431572942e-05, "loss": 2.4625, "step": 2768500 }, { "epoch": 5.796379408513916, "grad_norm": 17.874589920043945, "learning_rate": 2.106220317508265e-05, "loss": 2.4769, "step": 2769000 }, { "epoch": 5.797426064239541, "grad_norm": 15.349862098693848, "learning_rate": 2.105695891859236e-05, "loss": 2.4626, "step": 2769500 }, { "epoch": 5.798472719965167, "grad_norm": 16.01974105834961, "learning_rate": 2.105171466210207e-05, "loss": 2.4629, "step": 2770000 }, { "epoch": 5.799519375690792, "grad_norm": 15.780619621276855, "learning_rate": 2.1046470405611776e-05, "loss": 2.4394, "step": 2770500 }, { "epoch": 5.800566031416418, "grad_norm": 15.836479187011719, "learning_rate": 2.104122614912148e-05, "loss": 2.4661, "step": 2771000 }, { "epoch": 5.801612687142043, "grad_norm": 15.947572708129883, "learning_rate": 2.1035981892631192e-05, "loss": 2.4573, "step": 2771500 }, { "epoch": 5.80265934286767, "grad_norm": 17.011831283569336, "learning_rate": 2.10307376361409e-05, "loss": 2.4506, "step": 2772000 }, { "epoch": 5.803705998593295, "grad_norm": 15.297348022460938, "learning_rate": 2.1025493379650607e-05, "loss": 2.4533, "step": 2772500 }, { "epoch": 5.8047526543189205, "grad_norm": 17.051374435424805, "learning_rate": 2.1020249123160315e-05, "loss": 2.4592, "step": 2773000 }, { "epoch": 5.805799310044546, "grad_norm": 15.984115600585938, "learning_rate": 2.1015004866670023e-05, "loss": 2.4654, "step": 2773500 }, { "epoch": 5.806845965770171, "grad_norm": 22.42622184753418, "learning_rate": 2.1009760610179734e-05, "loss": 2.4384, "step": 2774000 }, { "epoch": 5.807892621495797, "grad_norm": 18.308319091796875, "learning_rate": 2.100451635368944e-05, "loss": 2.4587, "step": 2774500 }, { "epoch": 5.808939277221422, "grad_norm": 15.021458625793457, "learning_rate": 2.099927209719915e-05, "loss": 2.4647, "step": 2775000 }, { "epoch": 5.809985932947048, "grad_norm": 15.22667121887207, "learning_rate": 2.0994027840708857e-05, "loss": 2.4424, "step": 2775500 }, { "epoch": 5.811032588672673, "grad_norm": 17.24907875061035, "learning_rate": 2.0988783584218564e-05, "loss": 2.4635, "step": 2776000 }, { "epoch": 5.812079244398299, "grad_norm": 16.36125373840332, "learning_rate": 2.0983539327728276e-05, "loss": 2.4581, "step": 2776500 }, { "epoch": 5.813125900123924, "grad_norm": 20.980449676513672, "learning_rate": 2.097829507123798e-05, "loss": 2.4607, "step": 2777000 }, { "epoch": 5.81417255584955, "grad_norm": 19.3668155670166, "learning_rate": 2.0973050814747688e-05, "loss": 2.4493, "step": 2777500 }, { "epoch": 5.815219211575175, "grad_norm": 16.776691436767578, "learning_rate": 2.0967806558257395e-05, "loss": 2.4561, "step": 2778000 }, { "epoch": 5.8162658673008005, "grad_norm": 18.82878875732422, "learning_rate": 2.0962562301767106e-05, "loss": 2.4486, "step": 2778500 }, { "epoch": 5.817312523026426, "grad_norm": 15.910736083984375, "learning_rate": 2.0957318045276814e-05, "loss": 2.4647, "step": 2779000 }, { "epoch": 5.818359178752051, "grad_norm": 18.022497177124023, "learning_rate": 2.095207378878652e-05, "loss": 2.4396, "step": 2779500 }, { "epoch": 5.819405834477677, "grad_norm": 14.954204559326172, "learning_rate": 2.094682953229623e-05, "loss": 2.4612, "step": 2780000 }, { "epoch": 5.820452490203302, "grad_norm": 15.971946716308594, "learning_rate": 2.094158527580594e-05, "loss": 2.4681, "step": 2780500 }, { "epoch": 5.821499145928928, "grad_norm": 17.8261775970459, "learning_rate": 2.0936341019315648e-05, "loss": 2.4696, "step": 2781000 }, { "epoch": 5.822545801654553, "grad_norm": 14.599653244018555, "learning_rate": 2.0931096762825356e-05, "loss": 2.4426, "step": 2781500 }, { "epoch": 5.823592457380179, "grad_norm": 15.40626049041748, "learning_rate": 2.0925852506335063e-05, "loss": 2.4564, "step": 2782000 }, { "epoch": 5.824639113105804, "grad_norm": 14.44461441040039, "learning_rate": 2.092060824984477e-05, "loss": 2.452, "step": 2782500 }, { "epoch": 5.82568576883143, "grad_norm": 19.166027069091797, "learning_rate": 2.091536399335448e-05, "loss": 2.4469, "step": 2783000 }, { "epoch": 5.826732424557055, "grad_norm": 17.138683319091797, "learning_rate": 2.0910119736864187e-05, "loss": 2.4554, "step": 2783500 }, { "epoch": 5.8277790802826805, "grad_norm": 17.047597885131836, "learning_rate": 2.0904875480373894e-05, "loss": 2.4646, "step": 2784000 }, { "epoch": 5.828825736008306, "grad_norm": 17.231342315673828, "learning_rate": 2.0899631223883602e-05, "loss": 2.4646, "step": 2784500 }, { "epoch": 5.8298723917339315, "grad_norm": 39.605342864990234, "learning_rate": 2.0894386967393313e-05, "loss": 2.4778, "step": 2785000 }, { "epoch": 5.830919047459557, "grad_norm": 24.700735092163086, "learning_rate": 2.088914271090302e-05, "loss": 2.4543, "step": 2785500 }, { "epoch": 5.831965703185182, "grad_norm": 16.043230056762695, "learning_rate": 2.088389845441273e-05, "loss": 2.4313, "step": 2786000 }, { "epoch": 5.833012358910808, "grad_norm": 14.366756439208984, "learning_rate": 2.0878654197922436e-05, "loss": 2.4485, "step": 2786500 }, { "epoch": 5.834059014636433, "grad_norm": 16.726207733154297, "learning_rate": 2.0873409941432144e-05, "loss": 2.4418, "step": 2787000 }, { "epoch": 5.83510567036206, "grad_norm": 16.626447677612305, "learning_rate": 2.0868165684941855e-05, "loss": 2.4427, "step": 2787500 }, { "epoch": 5.836152326087685, "grad_norm": 15.321352005004883, "learning_rate": 2.086292142845156e-05, "loss": 2.457, "step": 2788000 }, { "epoch": 5.8371989818133105, "grad_norm": 15.143753051757812, "learning_rate": 2.0857677171961267e-05, "loss": 2.4352, "step": 2788500 }, { "epoch": 5.838245637538936, "grad_norm": 17.11406135559082, "learning_rate": 2.0852432915470978e-05, "loss": 2.479, "step": 2789000 }, { "epoch": 5.8392922932645615, "grad_norm": 14.983207702636719, "learning_rate": 2.0847188658980686e-05, "loss": 2.4532, "step": 2789500 }, { "epoch": 5.840338948990187, "grad_norm": 16.7989501953125, "learning_rate": 2.0841944402490393e-05, "loss": 2.446, "step": 2790000 }, { "epoch": 5.841385604715812, "grad_norm": 17.564355850219727, "learning_rate": 2.08367001460001e-05, "loss": 2.4336, "step": 2790500 }, { "epoch": 5.842432260441438, "grad_norm": 20.958444595336914, "learning_rate": 2.083145588950981e-05, "loss": 2.4717, "step": 2791000 }, { "epoch": 5.843478916167063, "grad_norm": 17.63782501220703, "learning_rate": 2.082621163301952e-05, "loss": 2.4456, "step": 2791500 }, { "epoch": 5.844525571892689, "grad_norm": 16.746898651123047, "learning_rate": 2.0820967376529227e-05, "loss": 2.4501, "step": 2792000 }, { "epoch": 5.845572227618314, "grad_norm": 18.40171241760254, "learning_rate": 2.0815723120038935e-05, "loss": 2.4421, "step": 2792500 }, { "epoch": 5.84661888334394, "grad_norm": 15.123747825622559, "learning_rate": 2.0810478863548643e-05, "loss": 2.4378, "step": 2793000 }, { "epoch": 5.847665539069565, "grad_norm": 15.520947456359863, "learning_rate": 2.080523460705835e-05, "loss": 2.4366, "step": 2793500 }, { "epoch": 5.8487121947951906, "grad_norm": 17.397918701171875, "learning_rate": 2.0799990350568058e-05, "loss": 2.4613, "step": 2794000 }, { "epoch": 5.849758850520816, "grad_norm": 16.813325881958008, "learning_rate": 2.0794746094077766e-05, "loss": 2.4548, "step": 2794500 }, { "epoch": 5.8508055062464415, "grad_norm": 16.833799362182617, "learning_rate": 2.0789501837587474e-05, "loss": 2.4549, "step": 2795000 }, { "epoch": 5.851852161972067, "grad_norm": 18.918346405029297, "learning_rate": 2.078425758109718e-05, "loss": 2.4367, "step": 2795500 }, { "epoch": 5.852898817697692, "grad_norm": 15.267383575439453, "learning_rate": 2.0779013324606892e-05, "loss": 2.4546, "step": 2796000 }, { "epoch": 5.853945473423318, "grad_norm": 18.322750091552734, "learning_rate": 2.07737690681166e-05, "loss": 2.4465, "step": 2796500 }, { "epoch": 5.854992129148943, "grad_norm": 17.5872745513916, "learning_rate": 2.0768524811626308e-05, "loss": 2.4329, "step": 2797000 }, { "epoch": 5.856038784874569, "grad_norm": 15.872830390930176, "learning_rate": 2.0763280555136015e-05, "loss": 2.4428, "step": 2797500 }, { "epoch": 5.857085440600194, "grad_norm": 18.298946380615234, "learning_rate": 2.0758036298645727e-05, "loss": 2.4716, "step": 2798000 }, { "epoch": 5.85813209632582, "grad_norm": 20.365947723388672, "learning_rate": 2.0752792042155434e-05, "loss": 2.44, "step": 2798500 }, { "epoch": 5.859178752051445, "grad_norm": 14.400742530822754, "learning_rate": 2.0747547785665142e-05, "loss": 2.4735, "step": 2799000 }, { "epoch": 5.860225407777071, "grad_norm": 15.563483238220215, "learning_rate": 2.0742303529174846e-05, "loss": 2.461, "step": 2799500 }, { "epoch": 5.861272063502696, "grad_norm": 16.392919540405273, "learning_rate": 2.0737059272684557e-05, "loss": 2.4616, "step": 2800000 }, { "epoch": 5.8623187192283215, "grad_norm": 13.225963592529297, "learning_rate": 2.0731815016194265e-05, "loss": 2.4688, "step": 2800500 }, { "epoch": 5.863365374953947, "grad_norm": 15.234622955322266, "learning_rate": 2.0726570759703973e-05, "loss": 2.4565, "step": 2801000 }, { "epoch": 5.864412030679572, "grad_norm": 15.533337593078613, "learning_rate": 2.072132650321368e-05, "loss": 2.4533, "step": 2801500 }, { "epoch": 5.865458686405198, "grad_norm": 14.787059783935547, "learning_rate": 2.0716082246723388e-05, "loss": 2.4546, "step": 2802000 }, { "epoch": 5.866505342130823, "grad_norm": 15.750714302062988, "learning_rate": 2.07108379902331e-05, "loss": 2.4466, "step": 2802500 }, { "epoch": 5.867551997856449, "grad_norm": 15.74460220336914, "learning_rate": 2.0705593733742807e-05, "loss": 2.441, "step": 2803000 }, { "epoch": 5.868598653582074, "grad_norm": 17.601709365844727, "learning_rate": 2.0700349477252515e-05, "loss": 2.466, "step": 2803500 }, { "epoch": 5.8696453093077, "grad_norm": 15.164833068847656, "learning_rate": 2.0695105220762222e-05, "loss": 2.4434, "step": 2804000 }, { "epoch": 5.870691965033325, "grad_norm": 18.952306747436523, "learning_rate": 2.068986096427193e-05, "loss": 2.4486, "step": 2804500 }, { "epoch": 5.871738620758951, "grad_norm": 17.201854705810547, "learning_rate": 2.0684616707781638e-05, "loss": 2.4489, "step": 2805000 }, { "epoch": 5.872785276484576, "grad_norm": 23.853782653808594, "learning_rate": 2.0679372451291345e-05, "loss": 2.4544, "step": 2805500 }, { "epoch": 5.8738319322102015, "grad_norm": 15.336296081542969, "learning_rate": 2.0674128194801053e-05, "loss": 2.4317, "step": 2806000 }, { "epoch": 5.874878587935827, "grad_norm": 13.977892875671387, "learning_rate": 2.0668883938310764e-05, "loss": 2.4609, "step": 2806500 }, { "epoch": 5.875925243661453, "grad_norm": 19.323204040527344, "learning_rate": 2.0663639681820472e-05, "loss": 2.4413, "step": 2807000 }, { "epoch": 5.876971899387079, "grad_norm": 14.352346420288086, "learning_rate": 2.065839542533018e-05, "loss": 2.4402, "step": 2807500 }, { "epoch": 5.878018555112704, "grad_norm": 16.10207748413086, "learning_rate": 2.0653151168839887e-05, "loss": 2.4458, "step": 2808000 }, { "epoch": 5.87906521083833, "grad_norm": 18.501392364501953, "learning_rate": 2.0647906912349595e-05, "loss": 2.4406, "step": 2808500 }, { "epoch": 5.880111866563955, "grad_norm": 16.270597457885742, "learning_rate": 2.0642662655859306e-05, "loss": 2.4542, "step": 2809000 }, { "epoch": 5.881158522289581, "grad_norm": 20.275861740112305, "learning_rate": 2.0637418399369014e-05, "loss": 2.4424, "step": 2809500 }, { "epoch": 5.882205178015206, "grad_norm": 16.34186553955078, "learning_rate": 2.063217414287872e-05, "loss": 2.458, "step": 2810000 }, { "epoch": 5.8832518337408315, "grad_norm": 17.146055221557617, "learning_rate": 2.062692988638843e-05, "loss": 2.4467, "step": 2810500 }, { "epoch": 5.884298489466457, "grad_norm": 19.703598022460938, "learning_rate": 2.0621685629898137e-05, "loss": 2.4534, "step": 2811000 }, { "epoch": 5.885345145192082, "grad_norm": 19.569438934326172, "learning_rate": 2.0616441373407844e-05, "loss": 2.4442, "step": 2811500 }, { "epoch": 5.886391800917708, "grad_norm": 14.699204444885254, "learning_rate": 2.0611197116917552e-05, "loss": 2.449, "step": 2812000 }, { "epoch": 5.887438456643333, "grad_norm": 16.170738220214844, "learning_rate": 2.060595286042726e-05, "loss": 2.4417, "step": 2812500 }, { "epoch": 5.888485112368959, "grad_norm": 18.781490325927734, "learning_rate": 2.0600708603936967e-05, "loss": 2.4401, "step": 2813000 }, { "epoch": 5.889531768094584, "grad_norm": 21.95343780517578, "learning_rate": 2.059546434744668e-05, "loss": 2.4533, "step": 2813500 }, { "epoch": 5.89057842382021, "grad_norm": 17.012956619262695, "learning_rate": 2.0590220090956386e-05, "loss": 2.462, "step": 2814000 }, { "epoch": 5.891625079545835, "grad_norm": 16.511947631835938, "learning_rate": 2.0584975834466094e-05, "loss": 2.4365, "step": 2814500 }, { "epoch": 5.892671735271461, "grad_norm": 15.968138694763184, "learning_rate": 2.05797315779758e-05, "loss": 2.4422, "step": 2815000 }, { "epoch": 5.893718390997086, "grad_norm": 17.414487838745117, "learning_rate": 2.0574487321485513e-05, "loss": 2.4524, "step": 2815500 }, { "epoch": 5.8947650467227115, "grad_norm": 16.570281982421875, "learning_rate": 2.056924306499522e-05, "loss": 2.4679, "step": 2816000 }, { "epoch": 5.895811702448337, "grad_norm": 17.555282592773438, "learning_rate": 2.0563998808504925e-05, "loss": 2.4521, "step": 2816500 }, { "epoch": 5.896858358173962, "grad_norm": 16.216821670532227, "learning_rate": 2.0558754552014632e-05, "loss": 2.4427, "step": 2817000 }, { "epoch": 5.897905013899588, "grad_norm": 14.895980834960938, "learning_rate": 2.0553510295524343e-05, "loss": 2.4427, "step": 2817500 }, { "epoch": 5.898951669625213, "grad_norm": 23.4232177734375, "learning_rate": 2.054826603903405e-05, "loss": 2.4504, "step": 2818000 }, { "epoch": 5.899998325350839, "grad_norm": 17.635454177856445, "learning_rate": 2.054302178254376e-05, "loss": 2.4639, "step": 2818500 }, { "epoch": 5.901044981076464, "grad_norm": 17.230466842651367, "learning_rate": 2.0537777526053467e-05, "loss": 2.4616, "step": 2819000 }, { "epoch": 5.90209163680209, "grad_norm": 17.726642608642578, "learning_rate": 2.0532533269563174e-05, "loss": 2.4552, "step": 2819500 }, { "epoch": 5.903138292527715, "grad_norm": 17.523887634277344, "learning_rate": 2.0527289013072885e-05, "loss": 2.4466, "step": 2820000 }, { "epoch": 5.904184948253341, "grad_norm": 15.691495895385742, "learning_rate": 2.0522044756582593e-05, "loss": 2.4427, "step": 2820500 }, { "epoch": 5.905231603978966, "grad_norm": 17.67746925354004, "learning_rate": 2.05168005000923e-05, "loss": 2.4473, "step": 2821000 }, { "epoch": 5.9062782597045915, "grad_norm": 15.379096031188965, "learning_rate": 2.051155624360201e-05, "loss": 2.4472, "step": 2821500 }, { "epoch": 5.907324915430217, "grad_norm": 16.084840774536133, "learning_rate": 2.0506311987111716e-05, "loss": 2.4626, "step": 2822000 }, { "epoch": 5.908371571155843, "grad_norm": 17.102115631103516, "learning_rate": 2.0501067730621424e-05, "loss": 2.4331, "step": 2822500 }, { "epoch": 5.909418226881469, "grad_norm": 15.1185302734375, "learning_rate": 2.049582347413113e-05, "loss": 2.4372, "step": 2823000 }, { "epoch": 5.910464882607094, "grad_norm": 15.990584373474121, "learning_rate": 2.049057921764084e-05, "loss": 2.4505, "step": 2823500 }, { "epoch": 5.91151153833272, "grad_norm": 16.058767318725586, "learning_rate": 2.0485334961150547e-05, "loss": 2.4722, "step": 2824000 }, { "epoch": 5.912558194058345, "grad_norm": 17.6765193939209, "learning_rate": 2.0480090704660258e-05, "loss": 2.4379, "step": 2824500 }, { "epoch": 5.913604849783971, "grad_norm": 18.262069702148438, "learning_rate": 2.0474846448169966e-05, "loss": 2.447, "step": 2825000 }, { "epoch": 5.914651505509596, "grad_norm": 17.702884674072266, "learning_rate": 2.0469602191679673e-05, "loss": 2.4407, "step": 2825500 }, { "epoch": 5.9156981612352215, "grad_norm": 16.950176239013672, "learning_rate": 2.046435793518938e-05, "loss": 2.448, "step": 2826000 }, { "epoch": 5.916744816960847, "grad_norm": 18.11916160583496, "learning_rate": 2.0459113678699092e-05, "loss": 2.4488, "step": 2826500 }, { "epoch": 5.917791472686472, "grad_norm": 16.793617248535156, "learning_rate": 2.04538694222088e-05, "loss": 2.4565, "step": 2827000 }, { "epoch": 5.918838128412098, "grad_norm": 16.31207275390625, "learning_rate": 2.0448625165718507e-05, "loss": 2.4563, "step": 2827500 }, { "epoch": 5.919884784137723, "grad_norm": 14.888534545898438, "learning_rate": 2.0443380909228212e-05, "loss": 2.4588, "step": 2828000 }, { "epoch": 5.920931439863349, "grad_norm": 18.941667556762695, "learning_rate": 2.0438136652737923e-05, "loss": 2.4591, "step": 2828500 }, { "epoch": 5.921978095588974, "grad_norm": 17.251598358154297, "learning_rate": 2.043289239624763e-05, "loss": 2.4241, "step": 2829000 }, { "epoch": 5.9230247513146, "grad_norm": 16.794071197509766, "learning_rate": 2.0427648139757338e-05, "loss": 2.4548, "step": 2829500 }, { "epoch": 5.924071407040225, "grad_norm": 18.767913818359375, "learning_rate": 2.0422403883267046e-05, "loss": 2.4642, "step": 2830000 }, { "epoch": 5.925118062765851, "grad_norm": 18.79058837890625, "learning_rate": 2.0417159626776754e-05, "loss": 2.4508, "step": 2830500 }, { "epoch": 5.926164718491476, "grad_norm": 15.385478019714355, "learning_rate": 2.0411915370286465e-05, "loss": 2.4523, "step": 2831000 }, { "epoch": 5.9272113742171015, "grad_norm": 14.248677253723145, "learning_rate": 2.0406671113796172e-05, "loss": 2.4489, "step": 2831500 }, { "epoch": 5.928258029942727, "grad_norm": 16.85842514038086, "learning_rate": 2.040142685730588e-05, "loss": 2.4684, "step": 2832000 }, { "epoch": 5.9293046856683524, "grad_norm": 18.54047393798828, "learning_rate": 2.0396182600815588e-05, "loss": 2.4559, "step": 2832500 }, { "epoch": 5.930351341393978, "grad_norm": 16.218542098999023, "learning_rate": 2.03909383443253e-05, "loss": 2.4418, "step": 2833000 }, { "epoch": 5.931397997119603, "grad_norm": 17.41579246520996, "learning_rate": 2.0385694087835003e-05, "loss": 2.4416, "step": 2833500 }, { "epoch": 5.932444652845229, "grad_norm": 19.32930564880371, "learning_rate": 2.038044983134471e-05, "loss": 2.4571, "step": 2834000 }, { "epoch": 5.933491308570854, "grad_norm": 33.286109924316406, "learning_rate": 2.037520557485442e-05, "loss": 2.4477, "step": 2834500 }, { "epoch": 5.93453796429648, "grad_norm": 15.696547508239746, "learning_rate": 2.036996131836413e-05, "loss": 2.4545, "step": 2835000 }, { "epoch": 5.935584620022105, "grad_norm": 13.985383987426758, "learning_rate": 2.0364717061873837e-05, "loss": 2.4571, "step": 2835500 }, { "epoch": 5.936631275747731, "grad_norm": 17.853437423706055, "learning_rate": 2.0359472805383545e-05, "loss": 2.4605, "step": 2836000 }, { "epoch": 5.937677931473356, "grad_norm": 14.69644832611084, "learning_rate": 2.0354228548893253e-05, "loss": 2.4779, "step": 2836500 }, { "epoch": 5.9387245871989816, "grad_norm": 18.797958374023438, "learning_rate": 2.034898429240296e-05, "loss": 2.4454, "step": 2837000 }, { "epoch": 5.939771242924607, "grad_norm": 17.130565643310547, "learning_rate": 2.034374003591267e-05, "loss": 2.4592, "step": 2837500 }, { "epoch": 5.9408178986502325, "grad_norm": 17.541799545288086, "learning_rate": 2.033849577942238e-05, "loss": 2.454, "step": 2838000 }, { "epoch": 5.941864554375858, "grad_norm": 14.95970630645752, "learning_rate": 2.0333251522932087e-05, "loss": 2.4404, "step": 2838500 }, { "epoch": 5.942911210101483, "grad_norm": 15.122697830200195, "learning_rate": 2.032800726644179e-05, "loss": 2.4453, "step": 2839000 }, { "epoch": 5.943957865827109, "grad_norm": 13.400579452514648, "learning_rate": 2.0322763009951502e-05, "loss": 2.452, "step": 2839500 }, { "epoch": 5.945004521552734, "grad_norm": 14.675409317016602, "learning_rate": 2.031751875346121e-05, "loss": 2.4556, "step": 2840000 }, { "epoch": 5.94605117727836, "grad_norm": 20.0396785736084, "learning_rate": 2.0312274496970918e-05, "loss": 2.4439, "step": 2840500 }, { "epoch": 5.947097833003985, "grad_norm": 15.778785705566406, "learning_rate": 2.0307030240480625e-05, "loss": 2.4448, "step": 2841000 }, { "epoch": 5.948144488729611, "grad_norm": 16.865249633789062, "learning_rate": 2.0301785983990333e-05, "loss": 2.443, "step": 2841500 }, { "epoch": 5.949191144455237, "grad_norm": 16.33074951171875, "learning_rate": 2.0296541727500044e-05, "loss": 2.4378, "step": 2842000 }, { "epoch": 5.9502378001808625, "grad_norm": 16.800678253173828, "learning_rate": 2.0291297471009752e-05, "loss": 2.4333, "step": 2842500 }, { "epoch": 5.951284455906488, "grad_norm": 17.055828094482422, "learning_rate": 2.028605321451946e-05, "loss": 2.4517, "step": 2843000 }, { "epoch": 5.952331111632113, "grad_norm": 25.318702697753906, "learning_rate": 2.0280808958029167e-05, "loss": 2.4522, "step": 2843500 }, { "epoch": 5.953377767357739, "grad_norm": 17.58049201965332, "learning_rate": 2.0275564701538878e-05, "loss": 2.4427, "step": 2844000 }, { "epoch": 5.954424423083364, "grad_norm": 15.196013450622559, "learning_rate": 2.0270320445048586e-05, "loss": 2.4216, "step": 2844500 }, { "epoch": 5.95547107880899, "grad_norm": 14.436280250549316, "learning_rate": 2.026507618855829e-05, "loss": 2.4364, "step": 2845000 }, { "epoch": 5.956517734534615, "grad_norm": 14.912725448608398, "learning_rate": 2.0259831932067998e-05, "loss": 2.4456, "step": 2845500 }, { "epoch": 5.957564390260241, "grad_norm": 14.89148235321045, "learning_rate": 2.025458767557771e-05, "loss": 2.4636, "step": 2846000 }, { "epoch": 5.958611045985866, "grad_norm": 21.80938148498535, "learning_rate": 2.0249343419087417e-05, "loss": 2.4498, "step": 2846500 }, { "epoch": 5.959657701711492, "grad_norm": 19.261423110961914, "learning_rate": 2.0244099162597124e-05, "loss": 2.4637, "step": 2847000 }, { "epoch": 5.960704357437117, "grad_norm": 17.18735122680664, "learning_rate": 2.0238854906106832e-05, "loss": 2.4535, "step": 2847500 }, { "epoch": 5.9617510131627425, "grad_norm": 15.618229866027832, "learning_rate": 2.023361064961654e-05, "loss": 2.462, "step": 2848000 }, { "epoch": 5.962797668888368, "grad_norm": 15.768156051635742, "learning_rate": 2.022836639312625e-05, "loss": 2.4602, "step": 2848500 }, { "epoch": 5.963844324613993, "grad_norm": 14.612009048461914, "learning_rate": 2.022312213663596e-05, "loss": 2.4441, "step": 2849000 }, { "epoch": 5.964890980339619, "grad_norm": 16.346324920654297, "learning_rate": 2.0217877880145666e-05, "loss": 2.4408, "step": 2849500 }, { "epoch": 5.965937636065244, "grad_norm": 14.929597854614258, "learning_rate": 2.0212633623655374e-05, "loss": 2.4448, "step": 2850000 }, { "epoch": 5.96698429179087, "grad_norm": 15.553224563598633, "learning_rate": 2.020738936716508e-05, "loss": 2.4512, "step": 2850500 }, { "epoch": 5.968030947516495, "grad_norm": 17.009443283081055, "learning_rate": 2.020214511067479e-05, "loss": 2.4505, "step": 2851000 }, { "epoch": 5.969077603242121, "grad_norm": 17.958555221557617, "learning_rate": 2.0196900854184497e-05, "loss": 2.4765, "step": 2851500 }, { "epoch": 5.970124258967746, "grad_norm": 15.342814445495605, "learning_rate": 2.0191656597694205e-05, "loss": 2.4465, "step": 2852000 }, { "epoch": 5.971170914693372, "grad_norm": 17.222028732299805, "learning_rate": 2.0186412341203916e-05, "loss": 2.4348, "step": 2852500 }, { "epoch": 5.972217570418997, "grad_norm": 15.805734634399414, "learning_rate": 2.0181168084713623e-05, "loss": 2.4434, "step": 2853000 }, { "epoch": 5.9732642261446225, "grad_norm": 21.037376403808594, "learning_rate": 2.017592382822333e-05, "loss": 2.4469, "step": 2853500 }, { "epoch": 5.974310881870248, "grad_norm": 15.613492965698242, "learning_rate": 2.017067957173304e-05, "loss": 2.4429, "step": 2854000 }, { "epoch": 5.975357537595873, "grad_norm": 14.994658470153809, "learning_rate": 2.0165435315242746e-05, "loss": 2.4482, "step": 2854500 }, { "epoch": 5.976404193321499, "grad_norm": 16.74523162841797, "learning_rate": 2.0160191058752458e-05, "loss": 2.4461, "step": 2855000 }, { "epoch": 5.977450849047124, "grad_norm": 16.63080596923828, "learning_rate": 2.0154946802262165e-05, "loss": 2.4504, "step": 2855500 }, { "epoch": 5.97849750477275, "grad_norm": 16.56817054748535, "learning_rate": 2.0149702545771873e-05, "loss": 2.4379, "step": 2856000 }, { "epoch": 5.979544160498375, "grad_norm": 18.95882797241211, "learning_rate": 2.0144458289281577e-05, "loss": 2.4687, "step": 2856500 }, { "epoch": 5.980590816224002, "grad_norm": 18.505474090576172, "learning_rate": 2.013921403279129e-05, "loss": 2.4321, "step": 2857000 }, { "epoch": 5.981637471949627, "grad_norm": 22.55782127380371, "learning_rate": 2.0133969776300996e-05, "loss": 2.4582, "step": 2857500 }, { "epoch": 5.9826841276752525, "grad_norm": 16.694042205810547, "learning_rate": 2.0128725519810704e-05, "loss": 2.4619, "step": 2858000 }, { "epoch": 5.983730783400878, "grad_norm": 15.097858428955078, "learning_rate": 2.012348126332041e-05, "loss": 2.4563, "step": 2858500 }, { "epoch": 5.984777439126503, "grad_norm": 14.758306503295898, "learning_rate": 2.011823700683012e-05, "loss": 2.4566, "step": 2859000 }, { "epoch": 5.985824094852129, "grad_norm": 15.952262878417969, "learning_rate": 2.011299275033983e-05, "loss": 2.4563, "step": 2859500 }, { "epoch": 5.986870750577754, "grad_norm": 16.515174865722656, "learning_rate": 2.0107748493849538e-05, "loss": 2.4366, "step": 2860000 }, { "epoch": 5.98791740630338, "grad_norm": 13.512458801269531, "learning_rate": 2.0102504237359246e-05, "loss": 2.4526, "step": 2860500 }, { "epoch": 5.988964062029005, "grad_norm": 16.066211700439453, "learning_rate": 2.0097259980868953e-05, "loss": 2.4719, "step": 2861000 }, { "epoch": 5.990010717754631, "grad_norm": 17.04929542541504, "learning_rate": 2.0092015724378664e-05, "loss": 2.4525, "step": 2861500 }, { "epoch": 5.991057373480256, "grad_norm": 17.044130325317383, "learning_rate": 2.008677146788837e-05, "loss": 2.4391, "step": 2862000 }, { "epoch": 5.992104029205882, "grad_norm": 20.86850357055664, "learning_rate": 2.0081527211398076e-05, "loss": 2.4508, "step": 2862500 }, { "epoch": 5.993150684931507, "grad_norm": 15.647039413452148, "learning_rate": 2.0076282954907784e-05, "loss": 2.444, "step": 2863000 }, { "epoch": 5.9941973406571325, "grad_norm": 17.489810943603516, "learning_rate": 2.0071038698417495e-05, "loss": 2.4534, "step": 2863500 }, { "epoch": 5.995243996382758, "grad_norm": 20.274492263793945, "learning_rate": 2.0065794441927203e-05, "loss": 2.4361, "step": 2864000 }, { "epoch": 5.996290652108383, "grad_norm": 17.214754104614258, "learning_rate": 2.006055018543691e-05, "loss": 2.448, "step": 2864500 }, { "epoch": 5.997337307834009, "grad_norm": 14.1698579788208, "learning_rate": 2.0055305928946618e-05, "loss": 2.433, "step": 2865000 }, { "epoch": 5.998383963559634, "grad_norm": 18.510509490966797, "learning_rate": 2.0050061672456326e-05, "loss": 2.4419, "step": 2865500 }, { "epoch": 5.99943061928526, "grad_norm": 15.908750534057617, "learning_rate": 2.0044817415966037e-05, "loss": 2.4594, "step": 2866000 }, { "epoch": 6.000477275010885, "grad_norm": 14.705338478088379, "learning_rate": 2.0039573159475745e-05, "loss": 2.4392, "step": 2866500 }, { "epoch": 6.001523930736511, "grad_norm": 16.271528244018555, "learning_rate": 2.0034328902985452e-05, "loss": 2.463, "step": 2867000 }, { "epoch": 6.002570586462136, "grad_norm": 18.33270263671875, "learning_rate": 2.0029084646495157e-05, "loss": 2.4319, "step": 2867500 }, { "epoch": 6.003617242187762, "grad_norm": 17.000259399414062, "learning_rate": 2.0023840390004868e-05, "loss": 2.4534, "step": 2868000 }, { "epoch": 6.004663897913387, "grad_norm": 16.599224090576172, "learning_rate": 2.0018596133514575e-05, "loss": 2.4327, "step": 2868500 }, { "epoch": 6.0057105536390125, "grad_norm": 17.131391525268555, "learning_rate": 2.0013351877024283e-05, "loss": 2.4408, "step": 2869000 }, { "epoch": 6.006757209364638, "grad_norm": 15.46038818359375, "learning_rate": 2.000810762053399e-05, "loss": 2.4423, "step": 2869500 }, { "epoch": 6.007803865090263, "grad_norm": 15.312410354614258, "learning_rate": 2.0002863364043702e-05, "loss": 2.4485, "step": 2870000 }, { "epoch": 6.008850520815889, "grad_norm": 15.328120231628418, "learning_rate": 1.999761910755341e-05, "loss": 2.4534, "step": 2870500 }, { "epoch": 6.009897176541514, "grad_norm": 17.179901123046875, "learning_rate": 1.9992374851063117e-05, "loss": 2.4404, "step": 2871000 }, { "epoch": 6.01094383226714, "grad_norm": 21.254972457885742, "learning_rate": 1.9987130594572825e-05, "loss": 2.4417, "step": 2871500 }, { "epoch": 6.011990487992765, "grad_norm": 18.728464126586914, "learning_rate": 1.9981886338082533e-05, "loss": 2.4332, "step": 2872000 }, { "epoch": 6.013037143718391, "grad_norm": 16.82328987121582, "learning_rate": 1.9976642081592244e-05, "loss": 2.4666, "step": 2872500 }, { "epoch": 6.014083799444016, "grad_norm": 17.013710021972656, "learning_rate": 1.997139782510195e-05, "loss": 2.4332, "step": 2873000 }, { "epoch": 6.015130455169642, "grad_norm": 17.02140235900879, "learning_rate": 1.9966153568611656e-05, "loss": 2.4351, "step": 2873500 }, { "epoch": 6.016177110895267, "grad_norm": 23.459897994995117, "learning_rate": 1.9960909312121363e-05, "loss": 2.4342, "step": 2874000 }, { "epoch": 6.0172237666208925, "grad_norm": 18.473703384399414, "learning_rate": 1.9955665055631074e-05, "loss": 2.4451, "step": 2874500 }, { "epoch": 6.018270422346518, "grad_norm": 14.596941947937012, "learning_rate": 1.9950420799140782e-05, "loss": 2.4646, "step": 2875000 }, { "epoch": 6.0193170780721434, "grad_norm": 16.371240615844727, "learning_rate": 1.994517654265049e-05, "loss": 2.4373, "step": 2875500 }, { "epoch": 6.02036373379777, "grad_norm": 15.350059509277344, "learning_rate": 1.9939932286160198e-05, "loss": 2.4318, "step": 2876000 }, { "epoch": 6.021410389523395, "grad_norm": 17.64289665222168, "learning_rate": 1.9934688029669905e-05, "loss": 2.4427, "step": 2876500 }, { "epoch": 6.022457045249021, "grad_norm": 16.32891082763672, "learning_rate": 1.9929443773179616e-05, "loss": 2.4375, "step": 2877000 }, { "epoch": 6.023503700974646, "grad_norm": 16.469284057617188, "learning_rate": 1.9924199516689324e-05, "loss": 2.4404, "step": 2877500 }, { "epoch": 6.024550356700272, "grad_norm": 16.131662368774414, "learning_rate": 1.991895526019903e-05, "loss": 2.4281, "step": 2878000 }, { "epoch": 6.025597012425897, "grad_norm": 17.55629539489746, "learning_rate": 1.991371100370874e-05, "loss": 2.4414, "step": 2878500 }, { "epoch": 6.0266436681515225, "grad_norm": 16.938247680664062, "learning_rate": 1.9908466747218447e-05, "loss": 2.4347, "step": 2879000 }, { "epoch": 6.027690323877148, "grad_norm": 17.582727432250977, "learning_rate": 1.9903222490728155e-05, "loss": 2.4284, "step": 2879500 }, { "epoch": 6.028736979602773, "grad_norm": 15.124749183654785, "learning_rate": 1.9897978234237862e-05, "loss": 2.4218, "step": 2880000 }, { "epoch": 6.029783635328399, "grad_norm": 16.190261840820312, "learning_rate": 1.989273397774757e-05, "loss": 2.4553, "step": 2880500 }, { "epoch": 6.030830291054024, "grad_norm": 16.332979202270508, "learning_rate": 1.988748972125728e-05, "loss": 2.4214, "step": 2881000 }, { "epoch": 6.03187694677965, "grad_norm": 16.33365821838379, "learning_rate": 1.988224546476699e-05, "loss": 2.4196, "step": 2881500 }, { "epoch": 6.032923602505275, "grad_norm": 16.0023136138916, "learning_rate": 1.9877001208276697e-05, "loss": 2.4325, "step": 2882000 }, { "epoch": 6.033970258230901, "grad_norm": 16.781137466430664, "learning_rate": 1.9871756951786404e-05, "loss": 2.4351, "step": 2882500 }, { "epoch": 6.035016913956526, "grad_norm": 24.881929397583008, "learning_rate": 1.9866512695296112e-05, "loss": 2.4359, "step": 2883000 }, { "epoch": 6.036063569682152, "grad_norm": 16.529470443725586, "learning_rate": 1.9861268438805823e-05, "loss": 2.4378, "step": 2883500 }, { "epoch": 6.037110225407777, "grad_norm": 16.82660675048828, "learning_rate": 1.985602418231553e-05, "loss": 2.4381, "step": 2884000 }, { "epoch": 6.0381568811334025, "grad_norm": 21.258737564086914, "learning_rate": 1.9850779925825235e-05, "loss": 2.4452, "step": 2884500 }, { "epoch": 6.039203536859028, "grad_norm": 15.187554359436035, "learning_rate": 1.9845535669334943e-05, "loss": 2.4405, "step": 2885000 }, { "epoch": 6.0402501925846535, "grad_norm": 18.26313018798828, "learning_rate": 1.9840291412844654e-05, "loss": 2.4347, "step": 2885500 }, { "epoch": 6.041296848310279, "grad_norm": 14.88257884979248, "learning_rate": 1.983504715635436e-05, "loss": 2.4439, "step": 2886000 }, { "epoch": 6.042343504035904, "grad_norm": 18.132444381713867, "learning_rate": 1.982980289986407e-05, "loss": 2.4184, "step": 2886500 }, { "epoch": 6.04339015976153, "grad_norm": 17.6298885345459, "learning_rate": 1.9824558643373777e-05, "loss": 2.4285, "step": 2887000 }, { "epoch": 6.044436815487155, "grad_norm": 17.64301300048828, "learning_rate": 1.9819314386883488e-05, "loss": 2.4322, "step": 2887500 }, { "epoch": 6.045483471212781, "grad_norm": 16.969722747802734, "learning_rate": 1.9814070130393196e-05, "loss": 2.4163, "step": 2888000 }, { "epoch": 6.046530126938406, "grad_norm": 17.212703704833984, "learning_rate": 1.9808825873902903e-05, "loss": 2.4497, "step": 2888500 }, { "epoch": 6.047576782664032, "grad_norm": 16.10140609741211, "learning_rate": 1.980358161741261e-05, "loss": 2.4472, "step": 2889000 }, { "epoch": 6.048623438389657, "grad_norm": 15.239860534667969, "learning_rate": 1.979833736092232e-05, "loss": 2.4388, "step": 2889500 }, { "epoch": 6.049670094115283, "grad_norm": 16.145051956176758, "learning_rate": 1.979309310443203e-05, "loss": 2.4485, "step": 2890000 }, { "epoch": 6.050716749840908, "grad_norm": 15.666581153869629, "learning_rate": 1.9787848847941734e-05, "loss": 2.4321, "step": 2890500 }, { "epoch": 6.0517634055665335, "grad_norm": 14.408674240112305, "learning_rate": 1.9782604591451442e-05, "loss": 2.425, "step": 2891000 }, { "epoch": 6.052810061292159, "grad_norm": 19.389604568481445, "learning_rate": 1.977736033496115e-05, "loss": 2.4279, "step": 2891500 }, { "epoch": 6.053856717017784, "grad_norm": 15.054169654846191, "learning_rate": 1.977211607847086e-05, "loss": 2.4602, "step": 2892000 }, { "epoch": 6.05490337274341, "grad_norm": 16.936317443847656, "learning_rate": 1.9766871821980568e-05, "loss": 2.432, "step": 2892500 }, { "epoch": 6.055950028469036, "grad_norm": 15.186174392700195, "learning_rate": 1.9761627565490276e-05, "loss": 2.4422, "step": 2893000 }, { "epoch": 6.056996684194662, "grad_norm": 16.31912612915039, "learning_rate": 1.9756383308999984e-05, "loss": 2.4355, "step": 2893500 }, { "epoch": 6.058043339920287, "grad_norm": 15.646013259887695, "learning_rate": 1.975113905250969e-05, "loss": 2.4337, "step": 2894000 }, { "epoch": 6.0590899956459126, "grad_norm": 17.776182174682617, "learning_rate": 1.9745894796019402e-05, "loss": 2.4462, "step": 2894500 }, { "epoch": 6.060136651371538, "grad_norm": 16.524660110473633, "learning_rate": 1.974065053952911e-05, "loss": 2.4291, "step": 2895000 }, { "epoch": 6.0611833070971635, "grad_norm": 20.95606231689453, "learning_rate": 1.9735406283038818e-05, "loss": 2.4274, "step": 2895500 }, { "epoch": 6.062229962822789, "grad_norm": 15.08887767791748, "learning_rate": 1.9730162026548522e-05, "loss": 2.4466, "step": 2896000 }, { "epoch": 6.063276618548414, "grad_norm": 18.675146102905273, "learning_rate": 1.9724917770058233e-05, "loss": 2.4281, "step": 2896500 }, { "epoch": 6.06432327427404, "grad_norm": 13.762691497802734, "learning_rate": 1.971967351356794e-05, "loss": 2.4274, "step": 2897000 }, { "epoch": 6.065369929999665, "grad_norm": 17.504762649536133, "learning_rate": 1.971442925707765e-05, "loss": 2.4412, "step": 2897500 }, { "epoch": 6.066416585725291, "grad_norm": 18.178647994995117, "learning_rate": 1.9709185000587356e-05, "loss": 2.4423, "step": 2898000 }, { "epoch": 6.067463241450916, "grad_norm": 17.59202766418457, "learning_rate": 1.9703940744097067e-05, "loss": 2.4439, "step": 2898500 }, { "epoch": 6.068509897176542, "grad_norm": 15.500479698181152, "learning_rate": 1.9698696487606775e-05, "loss": 2.4335, "step": 2899000 }, { "epoch": 6.069556552902167, "grad_norm": 17.9145565032959, "learning_rate": 1.9693452231116483e-05, "loss": 2.4453, "step": 2899500 }, { "epoch": 6.070603208627793, "grad_norm": 15.876246452331543, "learning_rate": 1.968820797462619e-05, "loss": 2.4519, "step": 2900000 }, { "epoch": 6.071649864353418, "grad_norm": 15.781970977783203, "learning_rate": 1.9682963718135898e-05, "loss": 2.4278, "step": 2900500 }, { "epoch": 6.0726965200790435, "grad_norm": 17.717466354370117, "learning_rate": 1.967771946164561e-05, "loss": 2.4121, "step": 2901000 }, { "epoch": 6.073743175804669, "grad_norm": 16.7778377532959, "learning_rate": 1.9672475205155314e-05, "loss": 2.4516, "step": 2901500 }, { "epoch": 6.074789831530294, "grad_norm": 18.525312423706055, "learning_rate": 1.966723094866502e-05, "loss": 2.4198, "step": 2902000 }, { "epoch": 6.07583648725592, "grad_norm": 14.622900009155273, "learning_rate": 1.966198669217473e-05, "loss": 2.4358, "step": 2902500 }, { "epoch": 6.076883142981545, "grad_norm": 16.147186279296875, "learning_rate": 1.965674243568444e-05, "loss": 2.4303, "step": 2903000 }, { "epoch": 6.077929798707171, "grad_norm": 16.840621948242188, "learning_rate": 1.9651498179194148e-05, "loss": 2.4299, "step": 2903500 }, { "epoch": 6.078976454432796, "grad_norm": 15.320660591125488, "learning_rate": 1.9646253922703855e-05, "loss": 2.4445, "step": 2904000 }, { "epoch": 6.080023110158422, "grad_norm": 16.25324821472168, "learning_rate": 1.9641009666213563e-05, "loss": 2.451, "step": 2904500 }, { "epoch": 6.081069765884047, "grad_norm": 16.4992733001709, "learning_rate": 1.963576540972327e-05, "loss": 2.4438, "step": 2905000 }, { "epoch": 6.082116421609673, "grad_norm": 18.13588523864746, "learning_rate": 1.9630521153232982e-05, "loss": 2.4244, "step": 2905500 }, { "epoch": 6.083163077335298, "grad_norm": 18.269309997558594, "learning_rate": 1.962527689674269e-05, "loss": 2.436, "step": 2906000 }, { "epoch": 6.0842097330609235, "grad_norm": 16.353551864624023, "learning_rate": 1.9620032640252397e-05, "loss": 2.4327, "step": 2906500 }, { "epoch": 6.085256388786549, "grad_norm": 18.078433990478516, "learning_rate": 1.9614788383762105e-05, "loss": 2.4415, "step": 2907000 }, { "epoch": 6.086303044512174, "grad_norm": 18.155553817749023, "learning_rate": 1.9609544127271813e-05, "loss": 2.4326, "step": 2907500 }, { "epoch": 6.0873497002378, "grad_norm": 17.1566162109375, "learning_rate": 1.960429987078152e-05, "loss": 2.4585, "step": 2908000 }, { "epoch": 6.088396355963425, "grad_norm": 17.25872230529785, "learning_rate": 1.9599055614291228e-05, "loss": 2.4352, "step": 2908500 }, { "epoch": 6.089443011689051, "grad_norm": 16.1490421295166, "learning_rate": 1.9593811357800936e-05, "loss": 2.4302, "step": 2909000 }, { "epoch": 6.090489667414676, "grad_norm": 16.734054565429688, "learning_rate": 1.9588567101310647e-05, "loss": 2.441, "step": 2909500 }, { "epoch": 6.091536323140302, "grad_norm": 13.55341911315918, "learning_rate": 1.9583322844820354e-05, "loss": 2.4293, "step": 2910000 }, { "epoch": 6.092582978865927, "grad_norm": 16.632841110229492, "learning_rate": 1.9578078588330062e-05, "loss": 2.4516, "step": 2910500 }, { "epoch": 6.0936296345915535, "grad_norm": 17.596210479736328, "learning_rate": 1.957283433183977e-05, "loss": 2.4323, "step": 2911000 }, { "epoch": 6.094676290317179, "grad_norm": 18.092676162719727, "learning_rate": 1.9567590075349477e-05, "loss": 2.4488, "step": 2911500 }, { "epoch": 6.095722946042804, "grad_norm": 19.064258575439453, "learning_rate": 1.956234581885919e-05, "loss": 2.4374, "step": 2912000 }, { "epoch": 6.09676960176843, "grad_norm": 18.634117126464844, "learning_rate": 1.9557101562368896e-05, "loss": 2.4189, "step": 2912500 }, { "epoch": 6.097816257494055, "grad_norm": 18.1982364654541, "learning_rate": 1.95518573058786e-05, "loss": 2.4402, "step": 2913000 }, { "epoch": 6.098862913219681, "grad_norm": 16.498014450073242, "learning_rate": 1.9546613049388308e-05, "loss": 2.4613, "step": 2913500 }, { "epoch": 6.099909568945306, "grad_norm": 17.369213104248047, "learning_rate": 1.954136879289802e-05, "loss": 2.4169, "step": 2914000 }, { "epoch": 6.100956224670932, "grad_norm": 16.07923698425293, "learning_rate": 1.9536124536407727e-05, "loss": 2.4329, "step": 2914500 }, { "epoch": 6.102002880396557, "grad_norm": 16.55069351196289, "learning_rate": 1.9530880279917435e-05, "loss": 2.4533, "step": 2915000 }, { "epoch": 6.103049536122183, "grad_norm": 19.34941291809082, "learning_rate": 1.9525636023427142e-05, "loss": 2.4245, "step": 2915500 }, { "epoch": 6.104096191847808, "grad_norm": 17.67323875427246, "learning_rate": 1.9520391766936853e-05, "loss": 2.4338, "step": 2916000 }, { "epoch": 6.1051428475734335, "grad_norm": 16.282155990600586, "learning_rate": 1.951514751044656e-05, "loss": 2.439, "step": 2916500 }, { "epoch": 6.106189503299059, "grad_norm": 16.792545318603516, "learning_rate": 1.950990325395627e-05, "loss": 2.4472, "step": 2917000 }, { "epoch": 6.107236159024684, "grad_norm": 14.357569694519043, "learning_rate": 1.9504658997465977e-05, "loss": 2.4203, "step": 2917500 }, { "epoch": 6.10828281475031, "grad_norm": 14.232142448425293, "learning_rate": 1.9499414740975684e-05, "loss": 2.4593, "step": 2918000 }, { "epoch": 6.109329470475935, "grad_norm": 13.5386323928833, "learning_rate": 1.9494170484485392e-05, "loss": 2.4434, "step": 2918500 }, { "epoch": 6.110376126201561, "grad_norm": 14.241798400878906, "learning_rate": 1.94889262279951e-05, "loss": 2.4533, "step": 2919000 }, { "epoch": 6.111422781927186, "grad_norm": 19.479663848876953, "learning_rate": 1.9483681971504807e-05, "loss": 2.4468, "step": 2919500 }, { "epoch": 6.112469437652812, "grad_norm": 18.045412063598633, "learning_rate": 1.9478437715014515e-05, "loss": 2.4311, "step": 2920000 }, { "epoch": 6.113516093378437, "grad_norm": 13.864274978637695, "learning_rate": 1.9473193458524226e-05, "loss": 2.4452, "step": 2920500 }, { "epoch": 6.114562749104063, "grad_norm": 17.444766998291016, "learning_rate": 1.9467949202033934e-05, "loss": 2.4251, "step": 2921000 }, { "epoch": 6.115609404829688, "grad_norm": 16.37126922607422, "learning_rate": 1.946270494554364e-05, "loss": 2.4339, "step": 2921500 }, { "epoch": 6.1166560605553135, "grad_norm": 16.260652542114258, "learning_rate": 1.945746068905335e-05, "loss": 2.4387, "step": 2922000 }, { "epoch": 6.117702716280939, "grad_norm": 15.886009216308594, "learning_rate": 1.9452216432563057e-05, "loss": 2.4338, "step": 2922500 }, { "epoch": 6.118749372006564, "grad_norm": 20.30885887145996, "learning_rate": 1.9446972176072768e-05, "loss": 2.42, "step": 2923000 }, { "epoch": 6.11979602773219, "grad_norm": 14.15482234954834, "learning_rate": 1.9441727919582476e-05, "loss": 2.4217, "step": 2923500 }, { "epoch": 6.120842683457815, "grad_norm": 18.278423309326172, "learning_rate": 1.9436483663092183e-05, "loss": 2.4467, "step": 2924000 }, { "epoch": 6.121889339183441, "grad_norm": 14.827624320983887, "learning_rate": 1.943123940660189e-05, "loss": 2.433, "step": 2924500 }, { "epoch": 6.122935994909066, "grad_norm": 16.801342010498047, "learning_rate": 1.94259951501116e-05, "loss": 2.4374, "step": 2925000 }, { "epoch": 6.123982650634692, "grad_norm": 17.169063568115234, "learning_rate": 1.9420750893621306e-05, "loss": 2.4145, "step": 2925500 }, { "epoch": 6.125029306360317, "grad_norm": 15.574470520019531, "learning_rate": 1.9415506637131014e-05, "loss": 2.4309, "step": 2926000 }, { "epoch": 6.126075962085943, "grad_norm": 19.62022590637207, "learning_rate": 1.9410262380640722e-05, "loss": 2.4508, "step": 2926500 }, { "epoch": 6.127122617811568, "grad_norm": 14.061903953552246, "learning_rate": 1.9405018124150433e-05, "loss": 2.4226, "step": 2927000 }, { "epoch": 6.1281692735371935, "grad_norm": 17.85190200805664, "learning_rate": 1.939977386766014e-05, "loss": 2.4387, "step": 2927500 }, { "epoch": 6.12921592926282, "grad_norm": 15.62004566192627, "learning_rate": 1.9394529611169848e-05, "loss": 2.4418, "step": 2928000 }, { "epoch": 6.130262584988445, "grad_norm": 16.183889389038086, "learning_rate": 1.9389285354679556e-05, "loss": 2.4427, "step": 2928500 }, { "epoch": 6.131309240714071, "grad_norm": 16.15369415283203, "learning_rate": 1.9384041098189264e-05, "loss": 2.4412, "step": 2929000 }, { "epoch": 6.132355896439696, "grad_norm": 18.963090896606445, "learning_rate": 1.9378796841698975e-05, "loss": 2.454, "step": 2929500 }, { "epoch": 6.133402552165322, "grad_norm": 18.34490394592285, "learning_rate": 1.937355258520868e-05, "loss": 2.4415, "step": 2930000 }, { "epoch": 6.134449207890947, "grad_norm": 16.532224655151367, "learning_rate": 1.9368308328718387e-05, "loss": 2.4423, "step": 2930500 }, { "epoch": 6.135495863616573, "grad_norm": 16.534717559814453, "learning_rate": 1.9363064072228094e-05, "loss": 2.4402, "step": 2931000 }, { "epoch": 6.136542519342198, "grad_norm": 20.179302215576172, "learning_rate": 1.9357819815737805e-05, "loss": 2.4452, "step": 2931500 }, { "epoch": 6.1375891750678235, "grad_norm": 18.104455947875977, "learning_rate": 1.9352575559247513e-05, "loss": 2.4401, "step": 2932000 }, { "epoch": 6.138635830793449, "grad_norm": 17.048437118530273, "learning_rate": 1.934733130275722e-05, "loss": 2.4366, "step": 2932500 }, { "epoch": 6.1396824865190744, "grad_norm": 16.024799346923828, "learning_rate": 1.934208704626693e-05, "loss": 2.4489, "step": 2933000 }, { "epoch": 6.1407291422447, "grad_norm": 16.779247283935547, "learning_rate": 1.933684278977664e-05, "loss": 2.4438, "step": 2933500 }, { "epoch": 6.141775797970325, "grad_norm": 16.327415466308594, "learning_rate": 1.9331598533286347e-05, "loss": 2.4221, "step": 2934000 }, { "epoch": 6.142822453695951, "grad_norm": 15.253149032592773, "learning_rate": 1.9326354276796055e-05, "loss": 2.4559, "step": 2934500 }, { "epoch": 6.143869109421576, "grad_norm": 16.568714141845703, "learning_rate": 1.9321110020305763e-05, "loss": 2.4383, "step": 2935000 }, { "epoch": 6.144915765147202, "grad_norm": 18.95012664794922, "learning_rate": 1.931586576381547e-05, "loss": 2.4463, "step": 2935500 }, { "epoch": 6.145962420872827, "grad_norm": 17.420608520507812, "learning_rate": 1.9310621507325178e-05, "loss": 2.4255, "step": 2936000 }, { "epoch": 6.147009076598453, "grad_norm": 17.396209716796875, "learning_rate": 1.9305377250834886e-05, "loss": 2.4477, "step": 2936500 }, { "epoch": 6.148055732324078, "grad_norm": 15.007761001586914, "learning_rate": 1.9300132994344593e-05, "loss": 2.4541, "step": 2937000 }, { "epoch": 6.1491023880497035, "grad_norm": 17.499584197998047, "learning_rate": 1.92948887378543e-05, "loss": 2.4367, "step": 2937500 }, { "epoch": 6.150149043775329, "grad_norm": 14.264616966247559, "learning_rate": 1.9289644481364012e-05, "loss": 2.4472, "step": 2938000 }, { "epoch": 6.1511956995009545, "grad_norm": 16.155807495117188, "learning_rate": 1.928440022487372e-05, "loss": 2.4456, "step": 2938500 }, { "epoch": 6.15224235522658, "grad_norm": 16.874454498291016, "learning_rate": 1.9279155968383428e-05, "loss": 2.4277, "step": 2939000 }, { "epoch": 6.153289010952205, "grad_norm": 19.56464958190918, "learning_rate": 1.9273911711893135e-05, "loss": 2.4419, "step": 2939500 }, { "epoch": 6.154335666677831, "grad_norm": 18.318790435791016, "learning_rate": 1.9268667455402843e-05, "loss": 2.4044, "step": 2940000 }, { "epoch": 6.155382322403456, "grad_norm": 16.21206283569336, "learning_rate": 1.9263423198912554e-05, "loss": 2.4251, "step": 2940500 }, { "epoch": 6.156428978129082, "grad_norm": 18.925201416015625, "learning_rate": 1.9258178942422262e-05, "loss": 2.4276, "step": 2941000 }, { "epoch": 6.157475633854707, "grad_norm": 15.069512367248535, "learning_rate": 1.9252934685931966e-05, "loss": 2.4187, "step": 2941500 }, { "epoch": 6.158522289580333, "grad_norm": 16.66951560974121, "learning_rate": 1.9247690429441674e-05, "loss": 2.4442, "step": 2942000 }, { "epoch": 6.159568945305958, "grad_norm": 22.584047317504883, "learning_rate": 1.9242446172951385e-05, "loss": 2.4397, "step": 2942500 }, { "epoch": 6.160615601031584, "grad_norm": 14.48738956451416, "learning_rate": 1.9237201916461093e-05, "loss": 2.4322, "step": 2943000 }, { "epoch": 6.161662256757209, "grad_norm": 17.443220138549805, "learning_rate": 1.92319576599708e-05, "loss": 2.4267, "step": 2943500 }, { "epoch": 6.1627089124828345, "grad_norm": 16.295751571655273, "learning_rate": 1.9226713403480508e-05, "loss": 2.4574, "step": 2944000 }, { "epoch": 6.16375556820846, "grad_norm": 16.909223556518555, "learning_rate": 1.922146914699022e-05, "loss": 2.4368, "step": 2944500 }, { "epoch": 6.164802223934085, "grad_norm": 17.64588165283203, "learning_rate": 1.9216224890499927e-05, "loss": 2.4333, "step": 2945000 }, { "epoch": 6.165848879659711, "grad_norm": 17.114439010620117, "learning_rate": 1.9210980634009634e-05, "loss": 2.4263, "step": 2945500 }, { "epoch": 6.166895535385337, "grad_norm": 17.369596481323242, "learning_rate": 1.9205736377519342e-05, "loss": 2.4462, "step": 2946000 }, { "epoch": 6.167942191110963, "grad_norm": 21.060468673706055, "learning_rate": 1.920049212102905e-05, "loss": 2.4075, "step": 2946500 }, { "epoch": 6.168988846836588, "grad_norm": 18.679216384887695, "learning_rate": 1.9195247864538757e-05, "loss": 2.4521, "step": 2947000 }, { "epoch": 6.170035502562214, "grad_norm": 18.29163360595703, "learning_rate": 1.9190003608048465e-05, "loss": 2.4516, "step": 2947500 }, { "epoch": 6.171082158287839, "grad_norm": 15.548918724060059, "learning_rate": 1.9184759351558173e-05, "loss": 2.4392, "step": 2948000 }, { "epoch": 6.1721288140134645, "grad_norm": 17.700706481933594, "learning_rate": 1.917951509506788e-05, "loss": 2.4308, "step": 2948500 }, { "epoch": 6.17317546973909, "grad_norm": 15.898836135864258, "learning_rate": 1.917427083857759e-05, "loss": 2.4261, "step": 2949000 }, { "epoch": 6.174222125464715, "grad_norm": 17.529891967773438, "learning_rate": 1.91690265820873e-05, "loss": 2.448, "step": 2949500 }, { "epoch": 6.175268781190341, "grad_norm": 15.595376014709473, "learning_rate": 1.9163782325597007e-05, "loss": 2.44, "step": 2950000 }, { "epoch": 6.176315436915966, "grad_norm": 18.548181533813477, "learning_rate": 1.9158538069106715e-05, "loss": 2.4299, "step": 2950500 }, { "epoch": 6.177362092641592, "grad_norm": 16.633625030517578, "learning_rate": 1.9153293812616426e-05, "loss": 2.4446, "step": 2951000 }, { "epoch": 6.178408748367217, "grad_norm": 18.654043197631836, "learning_rate": 1.9148049556126133e-05, "loss": 2.4295, "step": 2951500 }, { "epoch": 6.179455404092843, "grad_norm": 18.3819522857666, "learning_rate": 1.914280529963584e-05, "loss": 2.4201, "step": 2952000 }, { "epoch": 6.180502059818468, "grad_norm": 34.3435173034668, "learning_rate": 1.9137561043145545e-05, "loss": 2.4328, "step": 2952500 }, { "epoch": 6.181548715544094, "grad_norm": 16.94193458557129, "learning_rate": 1.9132316786655257e-05, "loss": 2.4274, "step": 2953000 }, { "epoch": 6.182595371269719, "grad_norm": 21.06448745727539, "learning_rate": 1.9127072530164964e-05, "loss": 2.4219, "step": 2953500 }, { "epoch": 6.1836420269953445, "grad_norm": 14.087323188781738, "learning_rate": 1.9121828273674672e-05, "loss": 2.4432, "step": 2954000 }, { "epoch": 6.18468868272097, "grad_norm": 14.051115036010742, "learning_rate": 1.911658401718438e-05, "loss": 2.4232, "step": 2954500 }, { "epoch": 6.185735338446595, "grad_norm": 17.510805130004883, "learning_rate": 1.9111339760694087e-05, "loss": 2.4302, "step": 2955000 }, { "epoch": 6.186781994172221, "grad_norm": 16.959653854370117, "learning_rate": 1.91060955042038e-05, "loss": 2.4329, "step": 2955500 }, { "epoch": 6.187828649897846, "grad_norm": 18.094308853149414, "learning_rate": 1.9100851247713506e-05, "loss": 2.4489, "step": 2956000 }, { "epoch": 6.188875305623472, "grad_norm": 14.914125442504883, "learning_rate": 1.9095606991223214e-05, "loss": 2.4405, "step": 2956500 }, { "epoch": 6.189921961349097, "grad_norm": 17.642297744750977, "learning_rate": 1.909036273473292e-05, "loss": 2.4371, "step": 2957000 }, { "epoch": 6.190968617074723, "grad_norm": 15.693922996520996, "learning_rate": 1.908511847824263e-05, "loss": 2.4302, "step": 2957500 }, { "epoch": 6.192015272800348, "grad_norm": 16.197755813598633, "learning_rate": 1.907987422175234e-05, "loss": 2.4234, "step": 2958000 }, { "epoch": 6.193061928525974, "grad_norm": 17.558073043823242, "learning_rate": 1.9074629965262045e-05, "loss": 2.4126, "step": 2958500 }, { "epoch": 6.194108584251599, "grad_norm": 17.73297882080078, "learning_rate": 1.9069385708771752e-05, "loss": 2.4337, "step": 2959000 }, { "epoch": 6.1951552399772245, "grad_norm": 16.84465789794922, "learning_rate": 1.906414145228146e-05, "loss": 2.4262, "step": 2959500 }, { "epoch": 6.19620189570285, "grad_norm": 18.88178825378418, "learning_rate": 1.905889719579117e-05, "loss": 2.4115, "step": 2960000 }, { "epoch": 6.197248551428475, "grad_norm": 14.709548950195312, "learning_rate": 1.905365293930088e-05, "loss": 2.4163, "step": 2960500 }, { "epoch": 6.198295207154101, "grad_norm": 18.063810348510742, "learning_rate": 1.9048408682810586e-05, "loss": 2.4567, "step": 2961000 }, { "epoch": 6.199341862879726, "grad_norm": 16.683774948120117, "learning_rate": 1.9043164426320294e-05, "loss": 2.4483, "step": 2961500 }, { "epoch": 6.200388518605352, "grad_norm": 16.57549476623535, "learning_rate": 1.9037920169830005e-05, "loss": 2.428, "step": 2962000 }, { "epoch": 6.201435174330977, "grad_norm": 14.665607452392578, "learning_rate": 1.9032675913339713e-05, "loss": 2.4194, "step": 2962500 }, { "epoch": 6.202481830056604, "grad_norm": 14.673174858093262, "learning_rate": 1.902743165684942e-05, "loss": 2.4369, "step": 2963000 }, { "epoch": 6.203528485782229, "grad_norm": 14.716745376586914, "learning_rate": 1.9022187400359128e-05, "loss": 2.4421, "step": 2963500 }, { "epoch": 6.2045751415078545, "grad_norm": 17.270349502563477, "learning_rate": 1.9016943143868836e-05, "loss": 2.4334, "step": 2964000 }, { "epoch": 6.20562179723348, "grad_norm": 16.420063018798828, "learning_rate": 1.9011698887378544e-05, "loss": 2.4388, "step": 2964500 }, { "epoch": 6.206668452959105, "grad_norm": 16.48206901550293, "learning_rate": 1.900645463088825e-05, "loss": 2.4522, "step": 2965000 }, { "epoch": 6.207715108684731, "grad_norm": 16.69601058959961, "learning_rate": 1.900121037439796e-05, "loss": 2.4513, "step": 2965500 }, { "epoch": 6.208761764410356, "grad_norm": 18.58155059814453, "learning_rate": 1.8995966117907667e-05, "loss": 2.4239, "step": 2966000 }, { "epoch": 6.209808420135982, "grad_norm": 14.563716888427734, "learning_rate": 1.8990721861417378e-05, "loss": 2.445, "step": 2966500 }, { "epoch": 6.210855075861607, "grad_norm": 16.376646041870117, "learning_rate": 1.8985477604927085e-05, "loss": 2.4372, "step": 2967000 }, { "epoch": 6.211901731587233, "grad_norm": 17.40319061279297, "learning_rate": 1.8980233348436793e-05, "loss": 2.4503, "step": 2967500 }, { "epoch": 6.212948387312858, "grad_norm": 17.76449966430664, "learning_rate": 1.89749890919465e-05, "loss": 2.4174, "step": 2968000 }, { "epoch": 6.213995043038484, "grad_norm": 17.42402458190918, "learning_rate": 1.896974483545621e-05, "loss": 2.4309, "step": 2968500 }, { "epoch": 6.215041698764109, "grad_norm": 16.255598068237305, "learning_rate": 1.896450057896592e-05, "loss": 2.4414, "step": 2969000 }, { "epoch": 6.2160883544897345, "grad_norm": 18.15766143798828, "learning_rate": 1.8959256322475624e-05, "loss": 2.4366, "step": 2969500 }, { "epoch": 6.21713501021536, "grad_norm": 16.921428680419922, "learning_rate": 1.895401206598533e-05, "loss": 2.4407, "step": 2970000 }, { "epoch": 6.218181665940985, "grad_norm": 16.124046325683594, "learning_rate": 1.8948767809495043e-05, "loss": 2.4367, "step": 2970500 }, { "epoch": 6.219228321666611, "grad_norm": 16.93881607055664, "learning_rate": 1.894352355300475e-05, "loss": 2.4259, "step": 2971000 }, { "epoch": 6.220274977392236, "grad_norm": 18.941768646240234, "learning_rate": 1.8938279296514458e-05, "loss": 2.427, "step": 2971500 }, { "epoch": 6.221321633117862, "grad_norm": 15.609354019165039, "learning_rate": 1.8933035040024166e-05, "loss": 2.455, "step": 2972000 }, { "epoch": 6.222368288843487, "grad_norm": 14.543388366699219, "learning_rate": 1.8927790783533873e-05, "loss": 2.419, "step": 2972500 }, { "epoch": 6.223414944569113, "grad_norm": 16.322601318359375, "learning_rate": 1.8922546527043584e-05, "loss": 2.4362, "step": 2973000 }, { "epoch": 6.224461600294738, "grad_norm": 14.85899829864502, "learning_rate": 1.8917302270553292e-05, "loss": 2.4368, "step": 2973500 }, { "epoch": 6.225508256020364, "grad_norm": 19.318819046020508, "learning_rate": 1.8912058014063e-05, "loss": 2.4256, "step": 2974000 }, { "epoch": 6.226554911745989, "grad_norm": 16.87664794921875, "learning_rate": 1.8906813757572708e-05, "loss": 2.4347, "step": 2974500 }, { "epoch": 6.2276015674716145, "grad_norm": 15.81983470916748, "learning_rate": 1.8901569501082415e-05, "loss": 2.42, "step": 2975000 }, { "epoch": 6.22864822319724, "grad_norm": 14.627345085144043, "learning_rate": 1.8896325244592123e-05, "loss": 2.4374, "step": 2975500 }, { "epoch": 6.229694878922865, "grad_norm": 15.79505729675293, "learning_rate": 1.889108098810183e-05, "loss": 2.4358, "step": 2976000 }, { "epoch": 6.230741534648491, "grad_norm": 14.716205596923828, "learning_rate": 1.888583673161154e-05, "loss": 2.4514, "step": 2976500 }, { "epoch": 6.231788190374116, "grad_norm": 17.135643005371094, "learning_rate": 1.8880592475121246e-05, "loss": 2.4155, "step": 2977000 }, { "epoch": 6.232834846099742, "grad_norm": 15.679422378540039, "learning_rate": 1.8875348218630957e-05, "loss": 2.4129, "step": 2977500 }, { "epoch": 6.233881501825367, "grad_norm": 15.83415412902832, "learning_rate": 1.8870103962140665e-05, "loss": 2.4427, "step": 2978000 }, { "epoch": 6.234928157550993, "grad_norm": 17.70229721069336, "learning_rate": 1.8864859705650372e-05, "loss": 2.464, "step": 2978500 }, { "epoch": 6.235974813276618, "grad_norm": 14.528292655944824, "learning_rate": 1.885961544916008e-05, "loss": 2.4305, "step": 2979000 }, { "epoch": 6.237021469002244, "grad_norm": 15.457751274108887, "learning_rate": 1.885437119266979e-05, "loss": 2.4303, "step": 2979500 }, { "epoch": 6.238068124727869, "grad_norm": 16.910253524780273, "learning_rate": 1.88491269361795e-05, "loss": 2.4253, "step": 2980000 }, { "epoch": 6.2391147804534945, "grad_norm": 16.010883331298828, "learning_rate": 1.8843882679689207e-05, "loss": 2.4275, "step": 2980500 }, { "epoch": 6.240161436179121, "grad_norm": 19.052978515625, "learning_rate": 1.883863842319891e-05, "loss": 2.4455, "step": 2981000 }, { "epoch": 6.241208091904746, "grad_norm": 17.917083740234375, "learning_rate": 1.8833394166708622e-05, "loss": 2.4385, "step": 2981500 }, { "epoch": 6.242254747630372, "grad_norm": 16.239063262939453, "learning_rate": 1.882814991021833e-05, "loss": 2.4208, "step": 2982000 }, { "epoch": 6.243301403355997, "grad_norm": 14.1856689453125, "learning_rate": 1.8822905653728037e-05, "loss": 2.4273, "step": 2982500 }, { "epoch": 6.244348059081623, "grad_norm": 17.336402893066406, "learning_rate": 1.8817661397237745e-05, "loss": 2.4271, "step": 2983000 }, { "epoch": 6.245394714807248, "grad_norm": 16.340383529663086, "learning_rate": 1.8812417140747453e-05, "loss": 2.4336, "step": 2983500 }, { "epoch": 6.246441370532874, "grad_norm": 20.590044021606445, "learning_rate": 1.8807172884257164e-05, "loss": 2.4327, "step": 2984000 }, { "epoch": 6.247488026258499, "grad_norm": 17.891014099121094, "learning_rate": 1.880192862776687e-05, "loss": 2.4321, "step": 2984500 }, { "epoch": 6.2485346819841245, "grad_norm": 15.1126070022583, "learning_rate": 1.879668437127658e-05, "loss": 2.4383, "step": 2985000 }, { "epoch": 6.24958133770975, "grad_norm": 15.773385047912598, "learning_rate": 1.8791440114786287e-05, "loss": 2.4569, "step": 2985500 }, { "epoch": 6.2506279934353755, "grad_norm": 15.79201889038086, "learning_rate": 1.8786195858295995e-05, "loss": 2.4129, "step": 2986000 }, { "epoch": 6.251674649161001, "grad_norm": 18.591014862060547, "learning_rate": 1.8780951601805706e-05, "loss": 2.4054, "step": 2986500 }, { "epoch": 6.252721304886626, "grad_norm": 18.246713638305664, "learning_rate": 1.877570734531541e-05, "loss": 2.4227, "step": 2987000 }, { "epoch": 6.253767960612252, "grad_norm": 15.06279182434082, "learning_rate": 1.8770463088825118e-05, "loss": 2.4132, "step": 2987500 }, { "epoch": 6.254814616337877, "grad_norm": 18.364978790283203, "learning_rate": 1.876521883233483e-05, "loss": 2.4243, "step": 2988000 }, { "epoch": 6.255861272063503, "grad_norm": 15.707456588745117, "learning_rate": 1.8759974575844536e-05, "loss": 2.42, "step": 2988500 }, { "epoch": 6.256907927789128, "grad_norm": 18.719533920288086, "learning_rate": 1.8754730319354244e-05, "loss": 2.451, "step": 2989000 }, { "epoch": 6.257954583514754, "grad_norm": 17.173086166381836, "learning_rate": 1.8749486062863952e-05, "loss": 2.4134, "step": 2989500 }, { "epoch": 6.259001239240379, "grad_norm": 18.05629539489746, "learning_rate": 1.874424180637366e-05, "loss": 2.4366, "step": 2990000 }, { "epoch": 6.2600478949660046, "grad_norm": 15.150979042053223, "learning_rate": 1.873899754988337e-05, "loss": 2.4422, "step": 2990500 }, { "epoch": 6.26109455069163, "grad_norm": 16.534208297729492, "learning_rate": 1.873375329339308e-05, "loss": 2.4519, "step": 2991000 }, { "epoch": 6.2621412064172555, "grad_norm": 17.04361343383789, "learning_rate": 1.8728509036902786e-05, "loss": 2.431, "step": 2991500 }, { "epoch": 6.263187862142881, "grad_norm": 20.236875534057617, "learning_rate": 1.8723264780412494e-05, "loss": 2.4094, "step": 2992000 }, { "epoch": 6.264234517868506, "grad_norm": 17.395029067993164, "learning_rate": 1.87180205239222e-05, "loss": 2.4287, "step": 2992500 }, { "epoch": 6.265281173594132, "grad_norm": 15.31084156036377, "learning_rate": 1.871277626743191e-05, "loss": 2.4165, "step": 2993000 }, { "epoch": 6.266327829319757, "grad_norm": 17.50372314453125, "learning_rate": 1.8707532010941617e-05, "loss": 2.4268, "step": 2993500 }, { "epoch": 6.267374485045383, "grad_norm": 15.790592193603516, "learning_rate": 1.8702287754451324e-05, "loss": 2.4371, "step": 2994000 }, { "epoch": 6.268421140771008, "grad_norm": 17.5932559967041, "learning_rate": 1.8697043497961032e-05, "loss": 2.413, "step": 2994500 }, { "epoch": 6.269467796496634, "grad_norm": 16.999696731567383, "learning_rate": 1.8691799241470743e-05, "loss": 2.4222, "step": 2995000 }, { "epoch": 6.270514452222259, "grad_norm": 17.064233779907227, "learning_rate": 1.868655498498045e-05, "loss": 2.4118, "step": 2995500 }, { "epoch": 6.271561107947885, "grad_norm": 13.742898941040039, "learning_rate": 1.868131072849016e-05, "loss": 2.4333, "step": 2996000 }, { "epoch": 6.27260776367351, "grad_norm": 15.007424354553223, "learning_rate": 1.8676066471999866e-05, "loss": 2.426, "step": 2996500 }, { "epoch": 6.2736544193991355, "grad_norm": 16.22843360900879, "learning_rate": 1.8670822215509577e-05, "loss": 2.4321, "step": 2997000 }, { "epoch": 6.274701075124762, "grad_norm": 18.738584518432617, "learning_rate": 1.8665577959019285e-05, "loss": 2.4138, "step": 2997500 }, { "epoch": 6.275747730850387, "grad_norm": 21.78716278076172, "learning_rate": 1.866033370252899e-05, "loss": 2.4351, "step": 2998000 }, { "epoch": 6.276794386576013, "grad_norm": 18.724220275878906, "learning_rate": 1.8655089446038697e-05, "loss": 2.4435, "step": 2998500 }, { "epoch": 6.277841042301638, "grad_norm": 17.09122657775879, "learning_rate": 1.8649845189548408e-05, "loss": 2.4306, "step": 2999000 }, { "epoch": 6.278887698027264, "grad_norm": 16.188060760498047, "learning_rate": 1.8644600933058116e-05, "loss": 2.4414, "step": 2999500 }, { "epoch": 6.279934353752889, "grad_norm": 17.144289016723633, "learning_rate": 1.8639356676567824e-05, "loss": 2.4184, "step": 3000000 }, { "epoch": 6.280981009478515, "grad_norm": 15.327481269836426, "learning_rate": 1.863411242007753e-05, "loss": 2.4284, "step": 3000500 }, { "epoch": 6.28202766520414, "grad_norm": 16.70142364501953, "learning_rate": 1.862886816358724e-05, "loss": 2.4348, "step": 3001000 }, { "epoch": 6.2830743209297655, "grad_norm": 19.36049461364746, "learning_rate": 1.862362390709695e-05, "loss": 2.4283, "step": 3001500 }, { "epoch": 6.284120976655391, "grad_norm": 19.465219497680664, "learning_rate": 1.8618379650606658e-05, "loss": 2.4406, "step": 3002000 }, { "epoch": 6.285167632381016, "grad_norm": 19.38762664794922, "learning_rate": 1.8613135394116365e-05, "loss": 2.4162, "step": 3002500 }, { "epoch": 6.286214288106642, "grad_norm": 15.83216667175293, "learning_rate": 1.8607891137626073e-05, "loss": 2.4245, "step": 3003000 }, { "epoch": 6.287260943832267, "grad_norm": 18.628454208374023, "learning_rate": 1.860264688113578e-05, "loss": 2.4291, "step": 3003500 }, { "epoch": 6.288307599557893, "grad_norm": 30.285083770751953, "learning_rate": 1.859740262464549e-05, "loss": 2.4218, "step": 3004000 }, { "epoch": 6.289354255283518, "grad_norm": 22.30087661743164, "learning_rate": 1.8592158368155196e-05, "loss": 2.4433, "step": 3004500 }, { "epoch": 6.290400911009144, "grad_norm": 16.430131912231445, "learning_rate": 1.8586914111664904e-05, "loss": 2.4445, "step": 3005000 }, { "epoch": 6.291447566734769, "grad_norm": 15.946203231811523, "learning_rate": 1.8581669855174615e-05, "loss": 2.4505, "step": 3005500 }, { "epoch": 6.292494222460395, "grad_norm": 16.216934204101562, "learning_rate": 1.8576425598684323e-05, "loss": 2.4175, "step": 3006000 }, { "epoch": 6.29354087818602, "grad_norm": 20.65506362915039, "learning_rate": 1.857118134219403e-05, "loss": 2.4342, "step": 3006500 }, { "epoch": 6.2945875339116455, "grad_norm": 16.96569061279297, "learning_rate": 1.8565937085703738e-05, "loss": 2.4508, "step": 3007000 }, { "epoch": 6.295634189637271, "grad_norm": 19.18994140625, "learning_rate": 1.8560692829213446e-05, "loss": 2.4387, "step": 3007500 }, { "epoch": 6.296680845362896, "grad_norm": 17.036582946777344, "learning_rate": 1.8555448572723157e-05, "loss": 2.4188, "step": 3008000 }, { "epoch": 6.297727501088522, "grad_norm": 19.04331398010254, "learning_rate": 1.8550204316232864e-05, "loss": 2.435, "step": 3008500 }, { "epoch": 6.298774156814147, "grad_norm": 16.959455490112305, "learning_rate": 1.8544960059742572e-05, "loss": 2.4368, "step": 3009000 }, { "epoch": 6.299820812539773, "grad_norm": 15.316200256347656, "learning_rate": 1.8539715803252276e-05, "loss": 2.4167, "step": 3009500 }, { "epoch": 6.300867468265398, "grad_norm": 15.080986976623535, "learning_rate": 1.8534471546761988e-05, "loss": 2.4406, "step": 3010000 }, { "epoch": 6.301914123991024, "grad_norm": 15.503693580627441, "learning_rate": 1.8529227290271695e-05, "loss": 2.4093, "step": 3010500 }, { "epoch": 6.302960779716649, "grad_norm": 18.02195930480957, "learning_rate": 1.8523983033781403e-05, "loss": 2.423, "step": 3011000 }, { "epoch": 6.304007435442275, "grad_norm": 15.137564659118652, "learning_rate": 1.851873877729111e-05, "loss": 2.43, "step": 3011500 }, { "epoch": 6.3050540911679, "grad_norm": 18.088687896728516, "learning_rate": 1.8513494520800818e-05, "loss": 2.4467, "step": 3012000 }, { "epoch": 6.3061007468935255, "grad_norm": 15.067856788635254, "learning_rate": 1.850825026431053e-05, "loss": 2.42, "step": 3012500 }, { "epoch": 6.307147402619151, "grad_norm": 16.735185623168945, "learning_rate": 1.8503006007820237e-05, "loss": 2.4166, "step": 3013000 }, { "epoch": 6.308194058344776, "grad_norm": 17.08036231994629, "learning_rate": 1.8497761751329945e-05, "loss": 2.4324, "step": 3013500 }, { "epoch": 6.309240714070402, "grad_norm": 18.50482177734375, "learning_rate": 1.8492517494839652e-05, "loss": 2.4072, "step": 3014000 }, { "epoch": 6.310287369796027, "grad_norm": 15.495404243469238, "learning_rate": 1.8487273238349364e-05, "loss": 2.4212, "step": 3014500 }, { "epoch": 6.311334025521653, "grad_norm": 19.276979446411133, "learning_rate": 1.8482028981859068e-05, "loss": 2.4202, "step": 3015000 }, { "epoch": 6.312380681247278, "grad_norm": 16.758968353271484, "learning_rate": 1.8476784725368776e-05, "loss": 2.4207, "step": 3015500 }, { "epoch": 6.313427336972905, "grad_norm": 20.521133422851562, "learning_rate": 1.8471540468878483e-05, "loss": 2.422, "step": 3016000 }, { "epoch": 6.31447399269853, "grad_norm": 15.683266639709473, "learning_rate": 1.8466296212388194e-05, "loss": 2.4253, "step": 3016500 }, { "epoch": 6.3155206484241555, "grad_norm": 15.704612731933594, "learning_rate": 1.8461051955897902e-05, "loss": 2.4079, "step": 3017000 }, { "epoch": 6.316567304149781, "grad_norm": 16.59476661682129, "learning_rate": 1.845580769940761e-05, "loss": 2.4268, "step": 3017500 }, { "epoch": 6.317613959875406, "grad_norm": 15.105562210083008, "learning_rate": 1.8450563442917317e-05, "loss": 2.4371, "step": 3018000 }, { "epoch": 6.318660615601032, "grad_norm": 14.515127182006836, "learning_rate": 1.8445319186427025e-05, "loss": 2.4181, "step": 3018500 }, { "epoch": 6.319707271326657, "grad_norm": 19.247631072998047, "learning_rate": 1.8440074929936736e-05, "loss": 2.4351, "step": 3019000 }, { "epoch": 6.320753927052283, "grad_norm": 14.596039772033691, "learning_rate": 1.8434830673446444e-05, "loss": 2.4465, "step": 3019500 }, { "epoch": 6.321800582777908, "grad_norm": 23.2047119140625, "learning_rate": 1.842958641695615e-05, "loss": 2.4268, "step": 3020000 }, { "epoch": 6.322847238503534, "grad_norm": 16.226463317871094, "learning_rate": 1.842434216046586e-05, "loss": 2.4326, "step": 3020500 }, { "epoch": 6.323893894229159, "grad_norm": 14.008218765258789, "learning_rate": 1.8419097903975567e-05, "loss": 2.4287, "step": 3021000 }, { "epoch": 6.324940549954785, "grad_norm": 16.093746185302734, "learning_rate": 1.8413853647485275e-05, "loss": 2.4248, "step": 3021500 }, { "epoch": 6.32598720568041, "grad_norm": 17.52323341369629, "learning_rate": 1.8408609390994982e-05, "loss": 2.4143, "step": 3022000 }, { "epoch": 6.3270338614060355, "grad_norm": 30.94215202331543, "learning_rate": 1.840336513450469e-05, "loss": 2.419, "step": 3022500 }, { "epoch": 6.328080517131661, "grad_norm": 18.91297721862793, "learning_rate": 1.8398120878014398e-05, "loss": 2.4177, "step": 3023000 }, { "epoch": 6.329127172857286, "grad_norm": 16.637577056884766, "learning_rate": 1.839287662152411e-05, "loss": 2.4329, "step": 3023500 }, { "epoch": 6.330173828582912, "grad_norm": 21.73956298828125, "learning_rate": 1.8387632365033816e-05, "loss": 2.416, "step": 3024000 }, { "epoch": 6.331220484308537, "grad_norm": 19.620275497436523, "learning_rate": 1.8382388108543524e-05, "loss": 2.4351, "step": 3024500 }, { "epoch": 6.332267140034163, "grad_norm": 16.264387130737305, "learning_rate": 1.8377143852053232e-05, "loss": 2.4494, "step": 3025000 }, { "epoch": 6.333313795759788, "grad_norm": 16.611141204833984, "learning_rate": 1.8371899595562943e-05, "loss": 2.4262, "step": 3025500 }, { "epoch": 6.334360451485414, "grad_norm": 16.074085235595703, "learning_rate": 1.836665533907265e-05, "loss": 2.4455, "step": 3026000 }, { "epoch": 6.335407107211039, "grad_norm": 17.092546463012695, "learning_rate": 1.8361411082582355e-05, "loss": 2.4277, "step": 3026500 }, { "epoch": 6.336453762936665, "grad_norm": 18.561031341552734, "learning_rate": 1.8356166826092063e-05, "loss": 2.4355, "step": 3027000 }, { "epoch": 6.33750041866229, "grad_norm": 16.458175659179688, "learning_rate": 1.8350922569601774e-05, "loss": 2.4273, "step": 3027500 }, { "epoch": 6.3385470743879155, "grad_norm": 17.733720779418945, "learning_rate": 1.834567831311148e-05, "loss": 2.3896, "step": 3028000 }, { "epoch": 6.339593730113541, "grad_norm": 18.194133758544922, "learning_rate": 1.834043405662119e-05, "loss": 2.4215, "step": 3028500 }, { "epoch": 6.3406403858391664, "grad_norm": 19.53379249572754, "learning_rate": 1.8335189800130897e-05, "loss": 2.4263, "step": 3029000 }, { "epoch": 6.341687041564792, "grad_norm": 17.285076141357422, "learning_rate": 1.8329945543640604e-05, "loss": 2.4393, "step": 3029500 }, { "epoch": 6.342733697290417, "grad_norm": 15.839853286743164, "learning_rate": 1.8324701287150316e-05, "loss": 2.4341, "step": 3030000 }, { "epoch": 6.343780353016043, "grad_norm": 13.92209243774414, "learning_rate": 1.8319457030660023e-05, "loss": 2.4291, "step": 3030500 }, { "epoch": 6.344827008741668, "grad_norm": 16.11641502380371, "learning_rate": 1.831421277416973e-05, "loss": 2.433, "step": 3031000 }, { "epoch": 6.345873664467294, "grad_norm": 19.880674362182617, "learning_rate": 1.830896851767944e-05, "loss": 2.4155, "step": 3031500 }, { "epoch": 6.346920320192919, "grad_norm": 15.073749542236328, "learning_rate": 1.8303724261189146e-05, "loss": 2.4295, "step": 3032000 }, { "epoch": 6.3479669759185455, "grad_norm": 18.635744094848633, "learning_rate": 1.8298480004698854e-05, "loss": 2.4249, "step": 3032500 }, { "epoch": 6.349013631644171, "grad_norm": 16.9731502532959, "learning_rate": 1.829323574820856e-05, "loss": 2.4243, "step": 3033000 }, { "epoch": 6.350060287369796, "grad_norm": 17.680137634277344, "learning_rate": 1.828799149171827e-05, "loss": 2.4138, "step": 3033500 }, { "epoch": 6.351106943095422, "grad_norm": 15.017120361328125, "learning_rate": 1.828274723522798e-05, "loss": 2.4298, "step": 3034000 }, { "epoch": 6.352153598821047, "grad_norm": 18.439720153808594, "learning_rate": 1.8277502978737688e-05, "loss": 2.4214, "step": 3034500 }, { "epoch": 6.353200254546673, "grad_norm": 15.939119338989258, "learning_rate": 1.8272258722247396e-05, "loss": 2.4219, "step": 3035000 }, { "epoch": 6.354246910272298, "grad_norm": 16.752769470214844, "learning_rate": 1.8267014465757103e-05, "loss": 2.4434, "step": 3035500 }, { "epoch": 6.355293565997924, "grad_norm": 19.50714683532715, "learning_rate": 1.826177020926681e-05, "loss": 2.4324, "step": 3036000 }, { "epoch": 6.356340221723549, "grad_norm": 16.769147872924805, "learning_rate": 1.8256525952776522e-05, "loss": 2.4481, "step": 3036500 }, { "epoch": 6.357386877449175, "grad_norm": 18.147457122802734, "learning_rate": 1.825128169628623e-05, "loss": 2.4273, "step": 3037000 }, { "epoch": 6.3584335331748, "grad_norm": 19.125322341918945, "learning_rate": 1.8246037439795938e-05, "loss": 2.4336, "step": 3037500 }, { "epoch": 6.3594801889004255, "grad_norm": 17.492345809936523, "learning_rate": 1.8240793183305642e-05, "loss": 2.4068, "step": 3038000 }, { "epoch": 6.360526844626051, "grad_norm": 16.385984420776367, "learning_rate": 1.8235548926815353e-05, "loss": 2.4214, "step": 3038500 }, { "epoch": 6.3615735003516765, "grad_norm": 15.920267105102539, "learning_rate": 1.823030467032506e-05, "loss": 2.4169, "step": 3039000 }, { "epoch": 6.362620156077302, "grad_norm": 18.46404457092285, "learning_rate": 1.822506041383477e-05, "loss": 2.4413, "step": 3039500 }, { "epoch": 6.363666811802927, "grad_norm": 16.578990936279297, "learning_rate": 1.8219816157344476e-05, "loss": 2.4155, "step": 3040000 }, { "epoch": 6.364713467528553, "grad_norm": 19.869983673095703, "learning_rate": 1.8214571900854184e-05, "loss": 2.415, "step": 3040500 }, { "epoch": 6.365760123254178, "grad_norm": 15.669687271118164, "learning_rate": 1.8209327644363895e-05, "loss": 2.4359, "step": 3041000 }, { "epoch": 6.366806778979804, "grad_norm": 13.968551635742188, "learning_rate": 1.8204083387873603e-05, "loss": 2.4228, "step": 3041500 }, { "epoch": 6.367853434705429, "grad_norm": 13.306605339050293, "learning_rate": 1.819883913138331e-05, "loss": 2.419, "step": 3042000 }, { "epoch": 6.368900090431055, "grad_norm": 16.56558609008789, "learning_rate": 1.8193594874893018e-05, "loss": 2.4291, "step": 3042500 }, { "epoch": 6.36994674615668, "grad_norm": 17.40017318725586, "learning_rate": 1.818835061840273e-05, "loss": 2.4209, "step": 3043000 }, { "epoch": 6.370993401882306, "grad_norm": 18.265180587768555, "learning_rate": 1.8183106361912433e-05, "loss": 2.4321, "step": 3043500 }, { "epoch": 6.372040057607931, "grad_norm": 16.275224685668945, "learning_rate": 1.817786210542214e-05, "loss": 2.4151, "step": 3044000 }, { "epoch": 6.3730867133335565, "grad_norm": 19.63668441772461, "learning_rate": 1.817261784893185e-05, "loss": 2.4296, "step": 3044500 }, { "epoch": 6.374133369059182, "grad_norm": 15.196435928344727, "learning_rate": 1.816737359244156e-05, "loss": 2.423, "step": 3045000 }, { "epoch": 6.375180024784807, "grad_norm": 17.97781753540039, "learning_rate": 1.8162129335951267e-05, "loss": 2.425, "step": 3045500 }, { "epoch": 6.376226680510433, "grad_norm": 18.270734786987305, "learning_rate": 1.8156885079460975e-05, "loss": 2.4266, "step": 3046000 }, { "epoch": 6.377273336236058, "grad_norm": 19.622095108032227, "learning_rate": 1.8151640822970683e-05, "loss": 2.4363, "step": 3046500 }, { "epoch": 6.378319991961684, "grad_norm": 15.64818000793457, "learning_rate": 1.814639656648039e-05, "loss": 2.4287, "step": 3047000 }, { "epoch": 6.379366647687309, "grad_norm": 18.120925903320312, "learning_rate": 1.81411523099901e-05, "loss": 2.4103, "step": 3047500 }, { "epoch": 6.380413303412935, "grad_norm": 15.635915756225586, "learning_rate": 1.813590805349981e-05, "loss": 2.4238, "step": 3048000 }, { "epoch": 6.38145995913856, "grad_norm": 14.71761703491211, "learning_rate": 1.8130663797009517e-05, "loss": 2.4135, "step": 3048500 }, { "epoch": 6.382506614864186, "grad_norm": 17.58026695251465, "learning_rate": 1.812541954051922e-05, "loss": 2.4386, "step": 3049000 }, { "epoch": 6.383553270589811, "grad_norm": 17.450767517089844, "learning_rate": 1.8120175284028932e-05, "loss": 2.405, "step": 3049500 }, { "epoch": 6.3845999263154365, "grad_norm": 16.138458251953125, "learning_rate": 1.811493102753864e-05, "loss": 2.4311, "step": 3050000 }, { "epoch": 6.385646582041062, "grad_norm": 19.543384552001953, "learning_rate": 1.8109686771048348e-05, "loss": 2.4312, "step": 3050500 }, { "epoch": 6.386693237766688, "grad_norm": 20.454662322998047, "learning_rate": 1.8104442514558055e-05, "loss": 2.4121, "step": 3051000 }, { "epoch": 6.387739893492314, "grad_norm": 18.06150245666504, "learning_rate": 1.8099198258067767e-05, "loss": 2.4183, "step": 3051500 }, { "epoch": 6.388786549217939, "grad_norm": 17.98105812072754, "learning_rate": 1.8093954001577474e-05, "loss": 2.4328, "step": 3052000 }, { "epoch": 6.389833204943565, "grad_norm": 17.371793746948242, "learning_rate": 1.8088709745087182e-05, "loss": 2.4051, "step": 3052500 }, { "epoch": 6.39087986066919, "grad_norm": 23.237838745117188, "learning_rate": 1.808346548859689e-05, "loss": 2.4254, "step": 3053000 }, { "epoch": 6.391926516394816, "grad_norm": 18.76890754699707, "learning_rate": 1.8078221232106597e-05, "loss": 2.4229, "step": 3053500 }, { "epoch": 6.392973172120441, "grad_norm": 16.707901000976562, "learning_rate": 1.807297697561631e-05, "loss": 2.4296, "step": 3054000 }, { "epoch": 6.3940198278460665, "grad_norm": 19.120988845825195, "learning_rate": 1.8067732719126016e-05, "loss": 2.4215, "step": 3054500 }, { "epoch": 6.395066483571692, "grad_norm": 16.226205825805664, "learning_rate": 1.806248846263572e-05, "loss": 2.4275, "step": 3055000 }, { "epoch": 6.396113139297317, "grad_norm": 17.742752075195312, "learning_rate": 1.8057244206145428e-05, "loss": 2.4515, "step": 3055500 }, { "epoch": 6.397159795022943, "grad_norm": 17.63732147216797, "learning_rate": 1.805199994965514e-05, "loss": 2.4152, "step": 3056000 }, { "epoch": 6.398206450748568, "grad_norm": 20.680206298828125, "learning_rate": 1.8046755693164847e-05, "loss": 2.4181, "step": 3056500 }, { "epoch": 6.399253106474194, "grad_norm": 17.16114616394043, "learning_rate": 1.8041511436674555e-05, "loss": 2.4157, "step": 3057000 }, { "epoch": 6.400299762199819, "grad_norm": 16.124818801879883, "learning_rate": 1.8036267180184262e-05, "loss": 2.4232, "step": 3057500 }, { "epoch": 6.401346417925445, "grad_norm": 16.672189712524414, "learning_rate": 1.803102292369397e-05, "loss": 2.4158, "step": 3058000 }, { "epoch": 6.40239307365107, "grad_norm": 15.996792793273926, "learning_rate": 1.802577866720368e-05, "loss": 2.4252, "step": 3058500 }, { "epoch": 6.403439729376696, "grad_norm": 16.424436569213867, "learning_rate": 1.802053441071339e-05, "loss": 2.4089, "step": 3059000 }, { "epoch": 6.404486385102321, "grad_norm": 17.13212013244629, "learning_rate": 1.8015290154223096e-05, "loss": 2.4178, "step": 3059500 }, { "epoch": 6.4055330408279465, "grad_norm": 18.786840438842773, "learning_rate": 1.8010045897732804e-05, "loss": 2.428, "step": 3060000 }, { "epoch": 6.406579696553572, "grad_norm": 19.33573341369629, "learning_rate": 1.8004801641242512e-05, "loss": 2.4384, "step": 3060500 }, { "epoch": 6.407626352279197, "grad_norm": 20.068424224853516, "learning_rate": 1.799955738475222e-05, "loss": 2.4076, "step": 3061000 }, { "epoch": 6.408673008004823, "grad_norm": 19.25596809387207, "learning_rate": 1.7994313128261927e-05, "loss": 2.428, "step": 3061500 }, { "epoch": 6.409719663730448, "grad_norm": 17.41594696044922, "learning_rate": 1.7989068871771635e-05, "loss": 2.4119, "step": 3062000 }, { "epoch": 6.410766319456074, "grad_norm": 14.827215194702148, "learning_rate": 1.7983824615281346e-05, "loss": 2.4047, "step": 3062500 }, { "epoch": 6.411812975181699, "grad_norm": 19.360910415649414, "learning_rate": 1.7978580358791054e-05, "loss": 2.4162, "step": 3063000 }, { "epoch": 6.412859630907325, "grad_norm": 18.197654724121094, "learning_rate": 1.797333610230076e-05, "loss": 2.4231, "step": 3063500 }, { "epoch": 6.41390628663295, "grad_norm": 17.02173614501953, "learning_rate": 1.796809184581047e-05, "loss": 2.3999, "step": 3064000 }, { "epoch": 6.414952942358576, "grad_norm": 15.914484977722168, "learning_rate": 1.7962847589320177e-05, "loss": 2.4229, "step": 3064500 }, { "epoch": 6.415999598084201, "grad_norm": 15.309834480285645, "learning_rate": 1.7957603332829888e-05, "loss": 2.4239, "step": 3065000 }, { "epoch": 6.4170462538098265, "grad_norm": 18.1566219329834, "learning_rate": 1.7952359076339595e-05, "loss": 2.4565, "step": 3065500 }, { "epoch": 6.418092909535452, "grad_norm": 15.81182861328125, "learning_rate": 1.79471148198493e-05, "loss": 2.4238, "step": 3066000 }, { "epoch": 6.419139565261077, "grad_norm": 17.484872817993164, "learning_rate": 1.7941870563359007e-05, "loss": 2.4204, "step": 3066500 }, { "epoch": 6.420186220986703, "grad_norm": 16.4344539642334, "learning_rate": 1.793662630686872e-05, "loss": 2.4352, "step": 3067000 }, { "epoch": 6.421232876712329, "grad_norm": 16.288164138793945, "learning_rate": 1.7931382050378426e-05, "loss": 2.4062, "step": 3067500 }, { "epoch": 6.422279532437955, "grad_norm": 14.012053489685059, "learning_rate": 1.7926137793888134e-05, "loss": 2.414, "step": 3068000 }, { "epoch": 6.42332618816358, "grad_norm": 16.91411590576172, "learning_rate": 1.792089353739784e-05, "loss": 2.4204, "step": 3068500 }, { "epoch": 6.424372843889206, "grad_norm": 15.34298038482666, "learning_rate": 1.7915649280907553e-05, "loss": 2.4264, "step": 3069000 }, { "epoch": 6.425419499614831, "grad_norm": 15.408172607421875, "learning_rate": 1.791040502441726e-05, "loss": 2.3992, "step": 3069500 }, { "epoch": 6.4264661553404565, "grad_norm": 17.11248779296875, "learning_rate": 1.7905160767926968e-05, "loss": 2.4293, "step": 3070000 }, { "epoch": 6.427512811066082, "grad_norm": 13.784693717956543, "learning_rate": 1.7899916511436676e-05, "loss": 2.4149, "step": 3070500 }, { "epoch": 6.428559466791707, "grad_norm": 18.909069061279297, "learning_rate": 1.7894672254946383e-05, "loss": 2.4457, "step": 3071000 }, { "epoch": 6.429606122517333, "grad_norm": 14.89365291595459, "learning_rate": 1.7889427998456095e-05, "loss": 2.421, "step": 3071500 }, { "epoch": 6.430652778242958, "grad_norm": 16.133386611938477, "learning_rate": 1.78841837419658e-05, "loss": 2.4366, "step": 3072000 }, { "epoch": 6.431699433968584, "grad_norm": 14.068286895751953, "learning_rate": 1.7878939485475507e-05, "loss": 2.3937, "step": 3072500 }, { "epoch": 6.432746089694209, "grad_norm": 18.139434814453125, "learning_rate": 1.7873695228985214e-05, "loss": 2.4272, "step": 3073000 }, { "epoch": 6.433792745419835, "grad_norm": 20.26055908203125, "learning_rate": 1.7868450972494925e-05, "loss": 2.4138, "step": 3073500 }, { "epoch": 6.43483940114546, "grad_norm": 16.77314567565918, "learning_rate": 1.7863206716004633e-05, "loss": 2.429, "step": 3074000 }, { "epoch": 6.435886056871086, "grad_norm": 21.386974334716797, "learning_rate": 1.785796245951434e-05, "loss": 2.3967, "step": 3074500 }, { "epoch": 6.436932712596711, "grad_norm": 17.754484176635742, "learning_rate": 1.785271820302405e-05, "loss": 2.4036, "step": 3075000 }, { "epoch": 6.4379793683223365, "grad_norm": 13.680587768554688, "learning_rate": 1.7847473946533756e-05, "loss": 2.414, "step": 3075500 }, { "epoch": 6.439026024047962, "grad_norm": 16.899452209472656, "learning_rate": 1.7842229690043467e-05, "loss": 2.4277, "step": 3076000 }, { "epoch": 6.440072679773587, "grad_norm": 17.54622459411621, "learning_rate": 1.7836985433553175e-05, "loss": 2.4304, "step": 3076500 }, { "epoch": 6.441119335499213, "grad_norm": 14.730923652648926, "learning_rate": 1.7831741177062883e-05, "loss": 2.4245, "step": 3077000 }, { "epoch": 6.442165991224838, "grad_norm": 18.602672576904297, "learning_rate": 1.7826496920572587e-05, "loss": 2.4151, "step": 3077500 }, { "epoch": 6.443212646950464, "grad_norm": 15.440065383911133, "learning_rate": 1.7821252664082298e-05, "loss": 2.3988, "step": 3078000 }, { "epoch": 6.444259302676089, "grad_norm": 16.411701202392578, "learning_rate": 1.7816008407592006e-05, "loss": 2.4299, "step": 3078500 }, { "epoch": 6.445305958401715, "grad_norm": 18.758625030517578, "learning_rate": 1.7810764151101713e-05, "loss": 2.4143, "step": 3079000 }, { "epoch": 6.44635261412734, "grad_norm": 15.19541072845459, "learning_rate": 1.780551989461142e-05, "loss": 2.4343, "step": 3079500 }, { "epoch": 6.447399269852966, "grad_norm": 21.996477127075195, "learning_rate": 1.7800275638121132e-05, "loss": 2.4442, "step": 3080000 }, { "epoch": 6.448445925578591, "grad_norm": 18.359037399291992, "learning_rate": 1.779503138163084e-05, "loss": 2.4397, "step": 3080500 }, { "epoch": 6.4494925813042165, "grad_norm": 15.829811096191406, "learning_rate": 1.7789787125140547e-05, "loss": 2.4066, "step": 3081000 }, { "epoch": 6.450539237029842, "grad_norm": 17.24616241455078, "learning_rate": 1.7784542868650255e-05, "loss": 2.4223, "step": 3081500 }, { "epoch": 6.4515858927554675, "grad_norm": 17.948030471801758, "learning_rate": 1.7779298612159963e-05, "loss": 2.427, "step": 3082000 }, { "epoch": 6.452632548481093, "grad_norm": 17.515777587890625, "learning_rate": 1.7774054355669674e-05, "loss": 2.4213, "step": 3082500 }, { "epoch": 6.453679204206718, "grad_norm": 14.507498741149902, "learning_rate": 1.7768810099179378e-05, "loss": 2.421, "step": 3083000 }, { "epoch": 6.454725859932344, "grad_norm": 15.781966209411621, "learning_rate": 1.7763565842689086e-05, "loss": 2.4324, "step": 3083500 }, { "epoch": 6.455772515657969, "grad_norm": 15.396688461303711, "learning_rate": 1.7758321586198794e-05, "loss": 2.4214, "step": 3084000 }, { "epoch": 6.456819171383595, "grad_norm": 18.122365951538086, "learning_rate": 1.7753077329708505e-05, "loss": 2.4259, "step": 3084500 }, { "epoch": 6.45786582710922, "grad_norm": 17.807889938354492, "learning_rate": 1.7747833073218212e-05, "loss": 2.4307, "step": 3085000 }, { "epoch": 6.458912482834846, "grad_norm": 20.11547088623047, "learning_rate": 1.774258881672792e-05, "loss": 2.4212, "step": 3085500 }, { "epoch": 6.459959138560472, "grad_norm": 16.957775115966797, "learning_rate": 1.7737344560237628e-05, "loss": 2.4107, "step": 3086000 }, { "epoch": 6.4610057942860974, "grad_norm": 17.01368522644043, "learning_rate": 1.7732100303747335e-05, "loss": 2.4239, "step": 3086500 }, { "epoch": 6.462052450011723, "grad_norm": 18.6083984375, "learning_rate": 1.7726856047257047e-05, "loss": 2.4248, "step": 3087000 }, { "epoch": 6.463099105737348, "grad_norm": 17.603635787963867, "learning_rate": 1.7721611790766754e-05, "loss": 2.4081, "step": 3087500 }, { "epoch": 6.464145761462974, "grad_norm": 17.728464126586914, "learning_rate": 1.7716367534276462e-05, "loss": 2.3997, "step": 3088000 }, { "epoch": 6.465192417188599, "grad_norm": 15.024717330932617, "learning_rate": 1.771112327778617e-05, "loss": 2.4073, "step": 3088500 }, { "epoch": 6.466239072914225, "grad_norm": 18.154342651367188, "learning_rate": 1.7705879021295877e-05, "loss": 2.4148, "step": 3089000 }, { "epoch": 6.46728572863985, "grad_norm": 15.33902359008789, "learning_rate": 1.7700634764805585e-05, "loss": 2.412, "step": 3089500 }, { "epoch": 6.468332384365476, "grad_norm": 22.29732894897461, "learning_rate": 1.7695390508315293e-05, "loss": 2.4112, "step": 3090000 }, { "epoch": 6.469379040091101, "grad_norm": 18.633394241333008, "learning_rate": 1.7690146251825e-05, "loss": 2.4193, "step": 3090500 }, { "epoch": 6.4704256958167266, "grad_norm": 13.775949478149414, "learning_rate": 1.768490199533471e-05, "loss": 2.3979, "step": 3091000 }, { "epoch": 6.471472351542352, "grad_norm": 23.189252853393555, "learning_rate": 1.767965773884442e-05, "loss": 2.4362, "step": 3091500 }, { "epoch": 6.4725190072679775, "grad_norm": 16.63471794128418, "learning_rate": 1.7674413482354127e-05, "loss": 2.4193, "step": 3092000 }, { "epoch": 6.473565662993603, "grad_norm": 18.844602584838867, "learning_rate": 1.7669169225863835e-05, "loss": 2.4318, "step": 3092500 }, { "epoch": 6.474612318719228, "grad_norm": 15.714128494262695, "learning_rate": 1.7663924969373542e-05, "loss": 2.4359, "step": 3093000 }, { "epoch": 6.475658974444854, "grad_norm": 15.281871795654297, "learning_rate": 1.7658680712883253e-05, "loss": 2.4125, "step": 3093500 }, { "epoch": 6.476705630170479, "grad_norm": 20.56157875061035, "learning_rate": 1.765343645639296e-05, "loss": 2.4178, "step": 3094000 }, { "epoch": 6.477752285896105, "grad_norm": 13.507817268371582, "learning_rate": 1.7648192199902665e-05, "loss": 2.4252, "step": 3094500 }, { "epoch": 6.47879894162173, "grad_norm": 14.996227264404297, "learning_rate": 1.7642947943412373e-05, "loss": 2.4144, "step": 3095000 }, { "epoch": 6.479845597347356, "grad_norm": 20.132919311523438, "learning_rate": 1.7637703686922084e-05, "loss": 2.4191, "step": 3095500 }, { "epoch": 6.480892253072981, "grad_norm": 15.760427474975586, "learning_rate": 1.7632459430431792e-05, "loss": 2.406, "step": 3096000 }, { "epoch": 6.481938908798607, "grad_norm": 14.947288513183594, "learning_rate": 1.76272151739415e-05, "loss": 2.429, "step": 3096500 }, { "epoch": 6.482985564524232, "grad_norm": 16.562376022338867, "learning_rate": 1.7621970917451207e-05, "loss": 2.4284, "step": 3097000 }, { "epoch": 6.4840322202498575, "grad_norm": 14.485369682312012, "learning_rate": 1.7616726660960918e-05, "loss": 2.4138, "step": 3097500 }, { "epoch": 6.485078875975483, "grad_norm": 14.414179801940918, "learning_rate": 1.7611482404470626e-05, "loss": 2.4087, "step": 3098000 }, { "epoch": 6.486125531701108, "grad_norm": 17.43213653564453, "learning_rate": 1.7606238147980334e-05, "loss": 2.4257, "step": 3098500 }, { "epoch": 6.487172187426734, "grad_norm": 16.04770851135254, "learning_rate": 1.760099389149004e-05, "loss": 2.4208, "step": 3099000 }, { "epoch": 6.488218843152359, "grad_norm": 14.144842147827148, "learning_rate": 1.759574963499975e-05, "loss": 2.425, "step": 3099500 }, { "epoch": 6.489265498877985, "grad_norm": 17.041364669799805, "learning_rate": 1.7590505378509457e-05, "loss": 2.4135, "step": 3100000 }, { "epoch": 6.49031215460361, "grad_norm": 16.98140525817871, "learning_rate": 1.7585261122019164e-05, "loss": 2.4153, "step": 3100500 }, { "epoch": 6.491358810329236, "grad_norm": 17.092777252197266, "learning_rate": 1.7580016865528872e-05, "loss": 2.4204, "step": 3101000 }, { "epoch": 6.492405466054861, "grad_norm": 16.97829246520996, "learning_rate": 1.757477260903858e-05, "loss": 2.4232, "step": 3101500 }, { "epoch": 6.493452121780487, "grad_norm": 18.294641494750977, "learning_rate": 1.756952835254829e-05, "loss": 2.4242, "step": 3102000 }, { "epoch": 6.494498777506113, "grad_norm": 18.104761123657227, "learning_rate": 1.7564284096058e-05, "loss": 2.4108, "step": 3102500 }, { "epoch": 6.495545433231738, "grad_norm": 17.696861267089844, "learning_rate": 1.7559039839567706e-05, "loss": 2.4327, "step": 3103000 }, { "epoch": 6.496592088957364, "grad_norm": 18.402568817138672, "learning_rate": 1.7553795583077414e-05, "loss": 2.4161, "step": 3103500 }, { "epoch": 6.497638744682989, "grad_norm": 17.37188720703125, "learning_rate": 1.754855132658712e-05, "loss": 2.4084, "step": 3104000 }, { "epoch": 6.498685400408615, "grad_norm": 19.97467613220215, "learning_rate": 1.7543307070096833e-05, "loss": 2.4221, "step": 3104500 }, { "epoch": 6.49973205613424, "grad_norm": 19.935705184936523, "learning_rate": 1.753806281360654e-05, "loss": 2.4297, "step": 3105000 }, { "epoch": 6.500778711859866, "grad_norm": 19.845096588134766, "learning_rate": 1.7532818557116248e-05, "loss": 2.4234, "step": 3105500 }, { "epoch": 6.501825367585491, "grad_norm": 17.8729248046875, "learning_rate": 1.7527574300625956e-05, "loss": 2.4208, "step": 3106000 }, { "epoch": 6.502872023311117, "grad_norm": 17.172788619995117, "learning_rate": 1.7522330044135663e-05, "loss": 2.4259, "step": 3106500 }, { "epoch": 6.503918679036742, "grad_norm": 15.258003234863281, "learning_rate": 1.751708578764537e-05, "loss": 2.4172, "step": 3107000 }, { "epoch": 6.5049653347623675, "grad_norm": 18.829814910888672, "learning_rate": 1.751184153115508e-05, "loss": 2.4403, "step": 3107500 }, { "epoch": 6.506011990487993, "grad_norm": 24.11675262451172, "learning_rate": 1.7506597274664786e-05, "loss": 2.4105, "step": 3108000 }, { "epoch": 6.507058646213618, "grad_norm": 20.01633071899414, "learning_rate": 1.7501353018174498e-05, "loss": 2.4235, "step": 3108500 }, { "epoch": 6.508105301939244, "grad_norm": 17.167911529541016, "learning_rate": 1.7496108761684205e-05, "loss": 2.4356, "step": 3109000 }, { "epoch": 6.509151957664869, "grad_norm": 19.964630126953125, "learning_rate": 1.7490864505193913e-05, "loss": 2.4043, "step": 3109500 }, { "epoch": 6.510198613390495, "grad_norm": 21.784223556518555, "learning_rate": 1.748562024870362e-05, "loss": 2.4191, "step": 3110000 }, { "epoch": 6.51124526911612, "grad_norm": 18.161535263061523, "learning_rate": 1.748037599221333e-05, "loss": 2.4157, "step": 3110500 }, { "epoch": 6.512291924841746, "grad_norm": 16.6147403717041, "learning_rate": 1.747513173572304e-05, "loss": 2.3999, "step": 3111000 }, { "epoch": 6.513338580567371, "grad_norm": 18.73662757873535, "learning_rate": 1.7469887479232744e-05, "loss": 2.4135, "step": 3111500 }, { "epoch": 6.514385236292997, "grad_norm": 17.883886337280273, "learning_rate": 1.746464322274245e-05, "loss": 2.412, "step": 3112000 }, { "epoch": 6.515431892018622, "grad_norm": 17.977542877197266, "learning_rate": 1.745939896625216e-05, "loss": 2.4359, "step": 3112500 }, { "epoch": 6.5164785477442475, "grad_norm": 16.54155921936035, "learning_rate": 1.745415470976187e-05, "loss": 2.4154, "step": 3113000 }, { "epoch": 6.517525203469873, "grad_norm": 18.40481948852539, "learning_rate": 1.7448910453271578e-05, "loss": 2.4204, "step": 3113500 }, { "epoch": 6.518571859195498, "grad_norm": 17.083375930786133, "learning_rate": 1.7443666196781286e-05, "loss": 2.4116, "step": 3114000 }, { "epoch": 6.519618514921124, "grad_norm": 16.424100875854492, "learning_rate": 1.7438421940290993e-05, "loss": 2.4178, "step": 3114500 }, { "epoch": 6.520665170646749, "grad_norm": 17.625198364257812, "learning_rate": 1.7433177683800704e-05, "loss": 2.4337, "step": 3115000 }, { "epoch": 6.521711826372375, "grad_norm": 16.83589744567871, "learning_rate": 1.7427933427310412e-05, "loss": 2.4029, "step": 3115500 }, { "epoch": 6.522758482098, "grad_norm": 14.691577911376953, "learning_rate": 1.742268917082012e-05, "loss": 2.4295, "step": 3116000 }, { "epoch": 6.523805137823626, "grad_norm": 16.71670913696289, "learning_rate": 1.7417444914329827e-05, "loss": 2.4202, "step": 3116500 }, { "epoch": 6.524851793549251, "grad_norm": 16.77362823486328, "learning_rate": 1.7412200657839535e-05, "loss": 2.4308, "step": 3117000 }, { "epoch": 6.525898449274877, "grad_norm": 14.963160514831543, "learning_rate": 1.7406956401349243e-05, "loss": 2.4246, "step": 3117500 }, { "epoch": 6.526945105000502, "grad_norm": 20.18146514892578, "learning_rate": 1.740171214485895e-05, "loss": 2.4321, "step": 3118000 }, { "epoch": 6.5279917607261275, "grad_norm": 17.87201690673828, "learning_rate": 1.7396467888368658e-05, "loss": 2.4067, "step": 3118500 }, { "epoch": 6.529038416451753, "grad_norm": 16.380258560180664, "learning_rate": 1.7391223631878366e-05, "loss": 2.3988, "step": 3119000 }, { "epoch": 6.530085072177378, "grad_norm": 15.288487434387207, "learning_rate": 1.7385979375388077e-05, "loss": 2.4139, "step": 3119500 }, { "epoch": 6.531131727903004, "grad_norm": 20.67608642578125, "learning_rate": 1.7380735118897785e-05, "loss": 2.4221, "step": 3120000 }, { "epoch": 6.532178383628629, "grad_norm": 19.251365661621094, "learning_rate": 1.7375490862407492e-05, "loss": 2.4023, "step": 3120500 }, { "epoch": 6.533225039354255, "grad_norm": 17.076566696166992, "learning_rate": 1.73702466059172e-05, "loss": 2.4237, "step": 3121000 }, { "epoch": 6.534271695079881, "grad_norm": 13.571259498596191, "learning_rate": 1.7365002349426908e-05, "loss": 2.4287, "step": 3121500 }, { "epoch": 6.535318350805507, "grad_norm": 14.464471817016602, "learning_rate": 1.735975809293662e-05, "loss": 2.4322, "step": 3122000 }, { "epoch": 6.536365006531132, "grad_norm": 17.169326782226562, "learning_rate": 1.7354513836446326e-05, "loss": 2.4271, "step": 3122500 }, { "epoch": 6.5374116622567575, "grad_norm": 15.402045249938965, "learning_rate": 1.734926957995603e-05, "loss": 2.4113, "step": 3123000 }, { "epoch": 6.538458317982383, "grad_norm": 18.833694458007812, "learning_rate": 1.7344025323465742e-05, "loss": 2.429, "step": 3123500 }, { "epoch": 6.539504973708008, "grad_norm": 18.992816925048828, "learning_rate": 1.733878106697545e-05, "loss": 2.4238, "step": 3124000 }, { "epoch": 6.540551629433634, "grad_norm": 16.03497314453125, "learning_rate": 1.7333536810485157e-05, "loss": 2.4297, "step": 3124500 }, { "epoch": 6.541598285159259, "grad_norm": 16.275991439819336, "learning_rate": 1.7328292553994865e-05, "loss": 2.4244, "step": 3125000 }, { "epoch": 6.542644940884885, "grad_norm": 16.9879093170166, "learning_rate": 1.7323048297504573e-05, "loss": 2.4303, "step": 3125500 }, { "epoch": 6.54369159661051, "grad_norm": 16.914321899414062, "learning_rate": 1.7317804041014284e-05, "loss": 2.3965, "step": 3126000 }, { "epoch": 6.544738252336136, "grad_norm": 15.961156845092773, "learning_rate": 1.731255978452399e-05, "loss": 2.4351, "step": 3126500 }, { "epoch": 6.545784908061761, "grad_norm": 18.79904556274414, "learning_rate": 1.73073155280337e-05, "loss": 2.4009, "step": 3127000 }, { "epoch": 6.546831563787387, "grad_norm": 20.9256591796875, "learning_rate": 1.7302071271543407e-05, "loss": 2.3796, "step": 3127500 }, { "epoch": 6.547878219513012, "grad_norm": 20.443069458007812, "learning_rate": 1.7296827015053114e-05, "loss": 2.4192, "step": 3128000 }, { "epoch": 6.5489248752386375, "grad_norm": 20.04672622680664, "learning_rate": 1.7291582758562822e-05, "loss": 2.41, "step": 3128500 }, { "epoch": 6.549971530964263, "grad_norm": 14.468134880065918, "learning_rate": 1.728633850207253e-05, "loss": 2.4089, "step": 3129000 }, { "epoch": 6.5510181866898884, "grad_norm": 17.750288009643555, "learning_rate": 1.7281094245582238e-05, "loss": 2.4205, "step": 3129500 }, { "epoch": 6.552064842415514, "grad_norm": 15.877403259277344, "learning_rate": 1.7275849989091945e-05, "loss": 2.4312, "step": 3130000 }, { "epoch": 6.553111498141139, "grad_norm": 18.23725128173828, "learning_rate": 1.7270605732601656e-05, "loss": 2.4158, "step": 3130500 }, { "epoch": 6.554158153866765, "grad_norm": 17.493640899658203, "learning_rate": 1.7265361476111364e-05, "loss": 2.4206, "step": 3131000 }, { "epoch": 6.55520480959239, "grad_norm": 18.709293365478516, "learning_rate": 1.726011721962107e-05, "loss": 2.4197, "step": 3131500 }, { "epoch": 6.556251465318016, "grad_norm": 14.803857803344727, "learning_rate": 1.725487296313078e-05, "loss": 2.4071, "step": 3132000 }, { "epoch": 6.557298121043641, "grad_norm": 15.063674926757812, "learning_rate": 1.724962870664049e-05, "loss": 2.4167, "step": 3132500 }, { "epoch": 6.558344776769267, "grad_norm": 16.27007484436035, "learning_rate": 1.7244384450150198e-05, "loss": 2.4067, "step": 3133000 }, { "epoch": 6.559391432494892, "grad_norm": 20.467273712158203, "learning_rate": 1.7239140193659906e-05, "loss": 2.4212, "step": 3133500 }, { "epoch": 6.5604380882205175, "grad_norm": 16.40850257873535, "learning_rate": 1.7233895937169614e-05, "loss": 2.4163, "step": 3134000 }, { "epoch": 6.561484743946143, "grad_norm": 18.066896438598633, "learning_rate": 1.722865168067932e-05, "loss": 2.408, "step": 3134500 }, { "epoch": 6.5625313996717685, "grad_norm": 16.7542781829834, "learning_rate": 1.722340742418903e-05, "loss": 2.4041, "step": 3135000 }, { "epoch": 6.563578055397394, "grad_norm": 15.702488899230957, "learning_rate": 1.7218163167698737e-05, "loss": 2.4022, "step": 3135500 }, { "epoch": 6.564624711123019, "grad_norm": 18.50126838684082, "learning_rate": 1.7212918911208444e-05, "loss": 2.4199, "step": 3136000 }, { "epoch": 6.565671366848645, "grad_norm": 16.618301391601562, "learning_rate": 1.7207674654718152e-05, "loss": 2.4251, "step": 3136500 }, { "epoch": 6.566718022574271, "grad_norm": 16.292890548706055, "learning_rate": 1.7202430398227863e-05, "loss": 2.406, "step": 3137000 }, { "epoch": 6.567764678299897, "grad_norm": 17.17815399169922, "learning_rate": 1.719718614173757e-05, "loss": 2.4245, "step": 3137500 }, { "epoch": 6.568811334025522, "grad_norm": 20.662555694580078, "learning_rate": 1.719194188524728e-05, "loss": 2.4171, "step": 3138000 }, { "epoch": 6.5698579897511475, "grad_norm": 15.390169143676758, "learning_rate": 1.7186697628756986e-05, "loss": 2.4224, "step": 3138500 }, { "epoch": 6.570904645476773, "grad_norm": 18.615257263183594, "learning_rate": 1.7181453372266694e-05, "loss": 2.427, "step": 3139000 }, { "epoch": 6.5719513012023985, "grad_norm": 20.494848251342773, "learning_rate": 1.7176209115776405e-05, "loss": 2.4189, "step": 3139500 }, { "epoch": 6.572997956928024, "grad_norm": 14.694427490234375, "learning_rate": 1.717096485928611e-05, "loss": 2.401, "step": 3140000 }, { "epoch": 6.574044612653649, "grad_norm": 14.974427223205566, "learning_rate": 1.7165720602795817e-05, "loss": 2.4163, "step": 3140500 }, { "epoch": 6.575091268379275, "grad_norm": 16.121633529663086, "learning_rate": 1.7160476346305525e-05, "loss": 2.4044, "step": 3141000 }, { "epoch": 6.5761379241049, "grad_norm": 19.531824111938477, "learning_rate": 1.7155232089815236e-05, "loss": 2.3917, "step": 3141500 }, { "epoch": 6.577184579830526, "grad_norm": 19.673133850097656, "learning_rate": 1.7149987833324943e-05, "loss": 2.4081, "step": 3142000 }, { "epoch": 6.578231235556151, "grad_norm": 21.524147033691406, "learning_rate": 1.714474357683465e-05, "loss": 2.4101, "step": 3142500 }, { "epoch": 6.579277891281777, "grad_norm": 18.154754638671875, "learning_rate": 1.713949932034436e-05, "loss": 2.408, "step": 3143000 }, { "epoch": 6.580324547007402, "grad_norm": 18.28815460205078, "learning_rate": 1.713425506385407e-05, "loss": 2.4149, "step": 3143500 }, { "epoch": 6.581371202733028, "grad_norm": 17.078977584838867, "learning_rate": 1.7129010807363778e-05, "loss": 2.4217, "step": 3144000 }, { "epoch": 6.582417858458653, "grad_norm": 17.269594192504883, "learning_rate": 1.7123766550873485e-05, "loss": 2.4191, "step": 3144500 }, { "epoch": 6.5834645141842785, "grad_norm": 20.54570770263672, "learning_rate": 1.7118522294383193e-05, "loss": 2.4041, "step": 3145000 }, { "epoch": 6.584511169909904, "grad_norm": 16.24264144897461, "learning_rate": 1.71132780378929e-05, "loss": 2.422, "step": 3145500 }, { "epoch": 6.585557825635529, "grad_norm": 15.629961013793945, "learning_rate": 1.7108033781402608e-05, "loss": 2.4162, "step": 3146000 }, { "epoch": 6.586604481361155, "grad_norm": 20.857242584228516, "learning_rate": 1.7102789524912316e-05, "loss": 2.4175, "step": 3146500 }, { "epoch": 6.58765113708678, "grad_norm": 16.857053756713867, "learning_rate": 1.7097545268422024e-05, "loss": 2.4319, "step": 3147000 }, { "epoch": 6.588697792812406, "grad_norm": 15.870168685913086, "learning_rate": 1.709230101193173e-05, "loss": 2.429, "step": 3147500 }, { "epoch": 6.589744448538031, "grad_norm": 17.183713912963867, "learning_rate": 1.7087056755441442e-05, "loss": 2.4248, "step": 3148000 }, { "epoch": 6.590791104263657, "grad_norm": 16.315692901611328, "learning_rate": 1.708181249895115e-05, "loss": 2.4187, "step": 3148500 }, { "epoch": 6.591837759989282, "grad_norm": 15.45399284362793, "learning_rate": 1.7076568242460858e-05, "loss": 2.4298, "step": 3149000 }, { "epoch": 6.592884415714908, "grad_norm": 14.848624229431152, "learning_rate": 1.7071323985970566e-05, "loss": 2.4199, "step": 3149500 }, { "epoch": 6.593931071440533, "grad_norm": 16.562822341918945, "learning_rate": 1.7066079729480273e-05, "loss": 2.4169, "step": 3150000 }, { "epoch": 6.5949777271661585, "grad_norm": 17.679466247558594, "learning_rate": 1.7060835472989984e-05, "loss": 2.4062, "step": 3150500 }, { "epoch": 6.596024382891784, "grad_norm": 17.89990234375, "learning_rate": 1.7055591216499692e-05, "loss": 2.4264, "step": 3151000 }, { "epoch": 6.597071038617409, "grad_norm": 17.561071395874023, "learning_rate": 1.7050346960009396e-05, "loss": 2.4049, "step": 3151500 }, { "epoch": 6.598117694343035, "grad_norm": 15.888259887695312, "learning_rate": 1.7045102703519107e-05, "loss": 2.4209, "step": 3152000 }, { "epoch": 6.59916435006866, "grad_norm": 18.136972427368164, "learning_rate": 1.7039858447028815e-05, "loss": 2.3993, "step": 3152500 }, { "epoch": 6.600211005794286, "grad_norm": 17.786239624023438, "learning_rate": 1.7034614190538523e-05, "loss": 2.4229, "step": 3153000 }, { "epoch": 6.601257661519911, "grad_norm": 16.034318923950195, "learning_rate": 1.702936993404823e-05, "loss": 2.427, "step": 3153500 }, { "epoch": 6.602304317245537, "grad_norm": 18.769447326660156, "learning_rate": 1.7024125677557938e-05, "loss": 2.4336, "step": 3154000 }, { "epoch": 6.603350972971162, "grad_norm": 18.82571029663086, "learning_rate": 1.701888142106765e-05, "loss": 2.4192, "step": 3154500 }, { "epoch": 6.604397628696788, "grad_norm": 15.726059913635254, "learning_rate": 1.7013637164577357e-05, "loss": 2.4048, "step": 3155000 }, { "epoch": 6.605444284422413, "grad_norm": 14.950737953186035, "learning_rate": 1.7008392908087065e-05, "loss": 2.4058, "step": 3155500 }, { "epoch": 6.6064909401480385, "grad_norm": 25.412776947021484, "learning_rate": 1.7003148651596772e-05, "loss": 2.4014, "step": 3156000 }, { "epoch": 6.607537595873665, "grad_norm": 18.19154167175293, "learning_rate": 1.699790439510648e-05, "loss": 2.4177, "step": 3156500 }, { "epoch": 6.60858425159929, "grad_norm": 14.58049488067627, "learning_rate": 1.6992660138616188e-05, "loss": 2.4215, "step": 3157000 }, { "epoch": 6.609630907324916, "grad_norm": 15.54483699798584, "learning_rate": 1.6987415882125895e-05, "loss": 2.4106, "step": 3157500 }, { "epoch": 6.610677563050541, "grad_norm": 17.955564498901367, "learning_rate": 1.6982171625635603e-05, "loss": 2.4246, "step": 3158000 }, { "epoch": 6.611724218776167, "grad_norm": 16.102170944213867, "learning_rate": 1.697692736914531e-05, "loss": 2.4097, "step": 3158500 }, { "epoch": 6.612770874501792, "grad_norm": 19.09446144104004, "learning_rate": 1.6971683112655022e-05, "loss": 2.4342, "step": 3159000 }, { "epoch": 6.613817530227418, "grad_norm": 18.590871810913086, "learning_rate": 1.696643885616473e-05, "loss": 2.409, "step": 3159500 }, { "epoch": 6.614864185953043, "grad_norm": 18.01875114440918, "learning_rate": 1.6961194599674437e-05, "loss": 2.4446, "step": 3160000 }, { "epoch": 6.6159108416786685, "grad_norm": 17.138612747192383, "learning_rate": 1.6955950343184145e-05, "loss": 2.4227, "step": 3160500 }, { "epoch": 6.616957497404294, "grad_norm": 16.905027389526367, "learning_rate": 1.6950706086693856e-05, "loss": 2.4266, "step": 3161000 }, { "epoch": 6.618004153129919, "grad_norm": 15.963515281677246, "learning_rate": 1.6945461830203564e-05, "loss": 2.4089, "step": 3161500 }, { "epoch": 6.619050808855545, "grad_norm": 20.808177947998047, "learning_rate": 1.694021757371327e-05, "loss": 2.4112, "step": 3162000 }, { "epoch": 6.62009746458117, "grad_norm": 14.128447532653809, "learning_rate": 1.6934973317222976e-05, "loss": 2.4044, "step": 3162500 }, { "epoch": 6.621144120306796, "grad_norm": 16.79754066467285, "learning_rate": 1.6929729060732687e-05, "loss": 2.4137, "step": 3163000 }, { "epoch": 6.622190776032421, "grad_norm": 14.097321510314941, "learning_rate": 1.6924484804242394e-05, "loss": 2.3961, "step": 3163500 }, { "epoch": 6.623237431758047, "grad_norm": 18.11598777770996, "learning_rate": 1.6919240547752102e-05, "loss": 2.4085, "step": 3164000 }, { "epoch": 6.624284087483672, "grad_norm": 16.968650817871094, "learning_rate": 1.691399629126181e-05, "loss": 2.4134, "step": 3164500 }, { "epoch": 6.625330743209298, "grad_norm": 21.23707389831543, "learning_rate": 1.6908752034771517e-05, "loss": 2.4163, "step": 3165000 }, { "epoch": 6.626377398934923, "grad_norm": 14.524930953979492, "learning_rate": 1.690350777828123e-05, "loss": 2.4173, "step": 3165500 }, { "epoch": 6.6274240546605485, "grad_norm": 17.398216247558594, "learning_rate": 1.6898263521790936e-05, "loss": 2.4109, "step": 3166000 }, { "epoch": 6.628470710386174, "grad_norm": 16.235252380371094, "learning_rate": 1.6893019265300644e-05, "loss": 2.4127, "step": 3166500 }, { "epoch": 6.629517366111799, "grad_norm": 16.977031707763672, "learning_rate": 1.688777500881035e-05, "loss": 2.4099, "step": 3167000 }, { "epoch": 6.630564021837425, "grad_norm": 17.362363815307617, "learning_rate": 1.688253075232006e-05, "loss": 2.4151, "step": 3167500 }, { "epoch": 6.63161067756305, "grad_norm": 23.20155906677246, "learning_rate": 1.687728649582977e-05, "loss": 2.4148, "step": 3168000 }, { "epoch": 6.632657333288676, "grad_norm": 16.914424896240234, "learning_rate": 1.6872042239339475e-05, "loss": 2.4292, "step": 3168500 }, { "epoch": 6.633703989014301, "grad_norm": 17.643327713012695, "learning_rate": 1.6866797982849182e-05, "loss": 2.4012, "step": 3169000 }, { "epoch": 6.634750644739927, "grad_norm": 16.259748458862305, "learning_rate": 1.6861553726358893e-05, "loss": 2.4091, "step": 3169500 }, { "epoch": 6.635797300465552, "grad_norm": 17.399959564208984, "learning_rate": 1.68563094698686e-05, "loss": 2.403, "step": 3170000 }, { "epoch": 6.636843956191178, "grad_norm": 15.880574226379395, "learning_rate": 1.685106521337831e-05, "loss": 2.4136, "step": 3170500 }, { "epoch": 6.637890611916803, "grad_norm": 17.962427139282227, "learning_rate": 1.6845820956888017e-05, "loss": 2.4095, "step": 3171000 }, { "epoch": 6.6389372676424285, "grad_norm": 16.40839385986328, "learning_rate": 1.6840576700397724e-05, "loss": 2.4025, "step": 3171500 }, { "epoch": 6.639983923368055, "grad_norm": 17.6927490234375, "learning_rate": 1.6835332443907435e-05, "loss": 2.4255, "step": 3172000 }, { "epoch": 6.64103057909368, "grad_norm": 18.780384063720703, "learning_rate": 1.6830088187417143e-05, "loss": 2.4068, "step": 3172500 }, { "epoch": 6.642077234819306, "grad_norm": 18.329565048217773, "learning_rate": 1.682484393092685e-05, "loss": 2.4133, "step": 3173000 }, { "epoch": 6.643123890544931, "grad_norm": 24.279394149780273, "learning_rate": 1.681959967443656e-05, "loss": 2.4276, "step": 3173500 }, { "epoch": 6.644170546270557, "grad_norm": 18.084468841552734, "learning_rate": 1.6814355417946266e-05, "loss": 2.403, "step": 3174000 }, { "epoch": 6.645217201996182, "grad_norm": 17.207948684692383, "learning_rate": 1.6809111161455974e-05, "loss": 2.4077, "step": 3174500 }, { "epoch": 6.646263857721808, "grad_norm": 16.998931884765625, "learning_rate": 1.680386690496568e-05, "loss": 2.4015, "step": 3175000 }, { "epoch": 6.647310513447433, "grad_norm": 17.521711349487305, "learning_rate": 1.679862264847539e-05, "loss": 2.4211, "step": 3175500 }, { "epoch": 6.6483571691730585, "grad_norm": 16.26783561706543, "learning_rate": 1.6793378391985097e-05, "loss": 2.4131, "step": 3176000 }, { "epoch": 6.649403824898684, "grad_norm": 17.369129180908203, "learning_rate": 1.6788134135494808e-05, "loss": 2.4496, "step": 3176500 }, { "epoch": 6.650450480624309, "grad_norm": 22.74053382873535, "learning_rate": 1.6782889879004516e-05, "loss": 2.3908, "step": 3177000 }, { "epoch": 6.651497136349935, "grad_norm": 15.64148998260498, "learning_rate": 1.6777645622514223e-05, "loss": 2.4128, "step": 3177500 }, { "epoch": 6.65254379207556, "grad_norm": 17.83123016357422, "learning_rate": 1.677240136602393e-05, "loss": 2.4214, "step": 3178000 }, { "epoch": 6.653590447801186, "grad_norm": 17.200700759887695, "learning_rate": 1.6767157109533642e-05, "loss": 2.4231, "step": 3178500 }, { "epoch": 6.654637103526811, "grad_norm": 17.01881980895996, "learning_rate": 1.676191285304335e-05, "loss": 2.4112, "step": 3179000 }, { "epoch": 6.655683759252437, "grad_norm": 16.267107009887695, "learning_rate": 1.6756668596553054e-05, "loss": 2.4266, "step": 3179500 }, { "epoch": 6.656730414978062, "grad_norm": 17.69097328186035, "learning_rate": 1.6751424340062762e-05, "loss": 2.4181, "step": 3180000 }, { "epoch": 6.657777070703688, "grad_norm": 13.976819038391113, "learning_rate": 1.6746180083572473e-05, "loss": 2.4041, "step": 3180500 }, { "epoch": 6.658823726429313, "grad_norm": 17.632413864135742, "learning_rate": 1.674093582708218e-05, "loss": 2.4128, "step": 3181000 }, { "epoch": 6.6598703821549385, "grad_norm": 16.281885147094727, "learning_rate": 1.6735691570591888e-05, "loss": 2.4149, "step": 3181500 }, { "epoch": 6.660917037880564, "grad_norm": 15.61105728149414, "learning_rate": 1.6730447314101596e-05, "loss": 2.4092, "step": 3182000 }, { "epoch": 6.6619636936061895, "grad_norm": 14.04416561126709, "learning_rate": 1.6725203057611304e-05, "loss": 2.4223, "step": 3182500 }, { "epoch": 6.663010349331815, "grad_norm": 21.24778938293457, "learning_rate": 1.6719958801121015e-05, "loss": 2.4201, "step": 3183000 }, { "epoch": 6.66405700505744, "grad_norm": 16.96368408203125, "learning_rate": 1.6714714544630722e-05, "loss": 2.4122, "step": 3183500 }, { "epoch": 6.665103660783066, "grad_norm": 17.340347290039062, "learning_rate": 1.670947028814043e-05, "loss": 2.3919, "step": 3184000 }, { "epoch": 6.666150316508691, "grad_norm": 20.432544708251953, "learning_rate": 1.6704226031650138e-05, "loss": 2.415, "step": 3184500 }, { "epoch": 6.667196972234317, "grad_norm": 16.400083541870117, "learning_rate": 1.6698981775159845e-05, "loss": 2.4079, "step": 3185000 }, { "epoch": 6.668243627959942, "grad_norm": 17.026622772216797, "learning_rate": 1.6693737518669553e-05, "loss": 2.4014, "step": 3185500 }, { "epoch": 6.669290283685568, "grad_norm": 17.759244918823242, "learning_rate": 1.668849326217926e-05, "loss": 2.397, "step": 3186000 }, { "epoch": 6.670336939411193, "grad_norm": 19.456350326538086, "learning_rate": 1.668324900568897e-05, "loss": 2.3961, "step": 3186500 }, { "epoch": 6.671383595136819, "grad_norm": 16.73568344116211, "learning_rate": 1.667800474919868e-05, "loss": 2.4061, "step": 3187000 }, { "epoch": 6.672430250862444, "grad_norm": 17.378034591674805, "learning_rate": 1.6672760492708387e-05, "loss": 2.3972, "step": 3187500 }, { "epoch": 6.6734769065880695, "grad_norm": 16.13705825805664, "learning_rate": 1.6667516236218095e-05, "loss": 2.4043, "step": 3188000 }, { "epoch": 6.674523562313695, "grad_norm": 18.418081283569336, "learning_rate": 1.6662271979727803e-05, "loss": 2.4109, "step": 3188500 }, { "epoch": 6.67557021803932, "grad_norm": 19.196748733520508, "learning_rate": 1.665702772323751e-05, "loss": 2.4207, "step": 3189000 }, { "epoch": 6.676616873764946, "grad_norm": 18.98824691772461, "learning_rate": 1.665178346674722e-05, "loss": 2.4177, "step": 3189500 }, { "epoch": 6.677663529490571, "grad_norm": 17.34296989440918, "learning_rate": 1.664653921025693e-05, "loss": 2.4224, "step": 3190000 }, { "epoch": 6.678710185216197, "grad_norm": 16.58908462524414, "learning_rate": 1.6641294953766637e-05, "loss": 2.4028, "step": 3190500 }, { "epoch": 6.679756840941822, "grad_norm": 17.30184555053711, "learning_rate": 1.663605069727634e-05, "loss": 2.4076, "step": 3191000 }, { "epoch": 6.6808034966674485, "grad_norm": 19.348966598510742, "learning_rate": 1.6630806440786052e-05, "loss": 2.4004, "step": 3191500 }, { "epoch": 6.681850152393074, "grad_norm": 17.205570220947266, "learning_rate": 1.662556218429576e-05, "loss": 2.3935, "step": 3192000 }, { "epoch": 6.6828968081186995, "grad_norm": 18.813453674316406, "learning_rate": 1.6620317927805468e-05, "loss": 2.4165, "step": 3192500 }, { "epoch": 6.683943463844325, "grad_norm": 14.798296928405762, "learning_rate": 1.6615073671315175e-05, "loss": 2.4056, "step": 3193000 }, { "epoch": 6.68499011956995, "grad_norm": 19.95932388305664, "learning_rate": 1.6609829414824883e-05, "loss": 2.3981, "step": 3193500 }, { "epoch": 6.686036775295576, "grad_norm": 17.9518985748291, "learning_rate": 1.6604585158334594e-05, "loss": 2.4082, "step": 3194000 }, { "epoch": 6.687083431021201, "grad_norm": 17.047964096069336, "learning_rate": 1.6599340901844302e-05, "loss": 2.408, "step": 3194500 }, { "epoch": 6.688130086746827, "grad_norm": 15.224348068237305, "learning_rate": 1.659409664535401e-05, "loss": 2.4067, "step": 3195000 }, { "epoch": 6.689176742472452, "grad_norm": 16.611228942871094, "learning_rate": 1.6588852388863717e-05, "loss": 2.418, "step": 3195500 }, { "epoch": 6.690223398198078, "grad_norm": 19.501319885253906, "learning_rate": 1.6583608132373428e-05, "loss": 2.4073, "step": 3196000 }, { "epoch": 6.691270053923703, "grad_norm": 21.37432861328125, "learning_rate": 1.6578363875883133e-05, "loss": 2.3913, "step": 3196500 }, { "epoch": 6.692316709649329, "grad_norm": 14.090328216552734, "learning_rate": 1.657311961939284e-05, "loss": 2.4163, "step": 3197000 }, { "epoch": 6.693363365374954, "grad_norm": 19.105009078979492, "learning_rate": 1.6567875362902548e-05, "loss": 2.4303, "step": 3197500 }, { "epoch": 6.6944100211005795, "grad_norm": 17.638071060180664, "learning_rate": 1.656263110641226e-05, "loss": 2.4004, "step": 3198000 }, { "epoch": 6.695456676826205, "grad_norm": 19.196683883666992, "learning_rate": 1.6557386849921967e-05, "loss": 2.3885, "step": 3198500 }, { "epoch": 6.69650333255183, "grad_norm": 18.17386817932129, "learning_rate": 1.6552142593431674e-05, "loss": 2.4011, "step": 3199000 }, { "epoch": 6.697549988277456, "grad_norm": 16.804250717163086, "learning_rate": 1.6546898336941382e-05, "loss": 2.408, "step": 3199500 }, { "epoch": 6.698596644003081, "grad_norm": 18.309764862060547, "learning_rate": 1.654165408045109e-05, "loss": 2.4035, "step": 3200000 }, { "epoch": 6.699643299728707, "grad_norm": 16.67054557800293, "learning_rate": 1.65364098239608e-05, "loss": 2.4187, "step": 3200500 }, { "epoch": 6.700689955454332, "grad_norm": 15.163799285888672, "learning_rate": 1.653116556747051e-05, "loss": 2.4056, "step": 3201000 }, { "epoch": 6.701736611179958, "grad_norm": 15.977714538574219, "learning_rate": 1.6525921310980216e-05, "loss": 2.401, "step": 3201500 }, { "epoch": 6.702783266905583, "grad_norm": 20.1727352142334, "learning_rate": 1.6520677054489924e-05, "loss": 2.397, "step": 3202000 }, { "epoch": 6.703829922631209, "grad_norm": 18.211772918701172, "learning_rate": 1.651543279799963e-05, "loss": 2.4287, "step": 3202500 }, { "epoch": 6.704876578356834, "grad_norm": 18.811538696289062, "learning_rate": 1.651018854150934e-05, "loss": 2.4219, "step": 3203000 }, { "epoch": 6.7059232340824595, "grad_norm": 16.22759437561035, "learning_rate": 1.6504944285019047e-05, "loss": 2.4065, "step": 3203500 }, { "epoch": 6.706969889808085, "grad_norm": 15.521438598632812, "learning_rate": 1.6499700028528755e-05, "loss": 2.4198, "step": 3204000 }, { "epoch": 6.70801654553371, "grad_norm": 21.602678298950195, "learning_rate": 1.6494455772038462e-05, "loss": 2.4167, "step": 3204500 }, { "epoch": 6.709063201259336, "grad_norm": 21.366641998291016, "learning_rate": 1.6489211515548173e-05, "loss": 2.4116, "step": 3205000 }, { "epoch": 6.710109856984961, "grad_norm": 19.22562599182129, "learning_rate": 1.648396725905788e-05, "loss": 2.4277, "step": 3205500 }, { "epoch": 6.711156512710587, "grad_norm": 15.340188026428223, "learning_rate": 1.647872300256759e-05, "loss": 2.4217, "step": 3206000 }, { "epoch": 6.712203168436212, "grad_norm": 20.795413970947266, "learning_rate": 1.6473478746077297e-05, "loss": 2.4102, "step": 3206500 }, { "epoch": 6.713249824161839, "grad_norm": 17.49630355834961, "learning_rate": 1.6468234489587008e-05, "loss": 2.4358, "step": 3207000 }, { "epoch": 6.714296479887464, "grad_norm": 17.866559982299805, "learning_rate": 1.6462990233096715e-05, "loss": 2.4109, "step": 3207500 }, { "epoch": 6.7153431356130895, "grad_norm": 15.855066299438477, "learning_rate": 1.645774597660642e-05, "loss": 2.4147, "step": 3208000 }, { "epoch": 6.716389791338715, "grad_norm": 16.366405487060547, "learning_rate": 1.6452501720116127e-05, "loss": 2.4004, "step": 3208500 }, { "epoch": 6.71743644706434, "grad_norm": 17.954835891723633, "learning_rate": 1.644725746362584e-05, "loss": 2.4214, "step": 3209000 }, { "epoch": 6.718483102789966, "grad_norm": 17.339176177978516, "learning_rate": 1.6442013207135546e-05, "loss": 2.3908, "step": 3209500 }, { "epoch": 6.719529758515591, "grad_norm": 32.12181091308594, "learning_rate": 1.6436768950645254e-05, "loss": 2.4089, "step": 3210000 }, { "epoch": 6.720576414241217, "grad_norm": 17.17961311340332, "learning_rate": 1.643152469415496e-05, "loss": 2.4082, "step": 3210500 }, { "epoch": 6.721623069966842, "grad_norm": 18.56057357788086, "learning_rate": 1.642628043766467e-05, "loss": 2.4115, "step": 3211000 }, { "epoch": 6.722669725692468, "grad_norm": 18.745325088500977, "learning_rate": 1.642103618117438e-05, "loss": 2.4092, "step": 3211500 }, { "epoch": 6.723716381418093, "grad_norm": 15.681438446044922, "learning_rate": 1.6415791924684088e-05, "loss": 2.4292, "step": 3212000 }, { "epoch": 6.724763037143719, "grad_norm": 18.50899887084961, "learning_rate": 1.6410547668193796e-05, "loss": 2.4257, "step": 3212500 }, { "epoch": 6.725809692869344, "grad_norm": 19.16826820373535, "learning_rate": 1.6405303411703503e-05, "loss": 2.4127, "step": 3213000 }, { "epoch": 6.7268563485949695, "grad_norm": 16.42657470703125, "learning_rate": 1.640005915521321e-05, "loss": 2.3897, "step": 3213500 }, { "epoch": 6.727903004320595, "grad_norm": 16.764509201049805, "learning_rate": 1.639481489872292e-05, "loss": 2.4091, "step": 3214000 }, { "epoch": 6.72894966004622, "grad_norm": 20.47132110595703, "learning_rate": 1.6389570642232626e-05, "loss": 2.4093, "step": 3214500 }, { "epoch": 6.729996315771846, "grad_norm": 19.302675247192383, "learning_rate": 1.6384326385742334e-05, "loss": 2.3974, "step": 3215000 }, { "epoch": 6.731042971497471, "grad_norm": 17.274627685546875, "learning_rate": 1.6379082129252045e-05, "loss": 2.4168, "step": 3215500 }, { "epoch": 6.732089627223097, "grad_norm": 15.475262641906738, "learning_rate": 1.6373837872761753e-05, "loss": 2.4023, "step": 3216000 }, { "epoch": 6.733136282948722, "grad_norm": 18.263547897338867, "learning_rate": 1.636859361627146e-05, "loss": 2.3981, "step": 3216500 }, { "epoch": 6.734182938674348, "grad_norm": 19.533811569213867, "learning_rate": 1.6363349359781168e-05, "loss": 2.3931, "step": 3217000 }, { "epoch": 6.735229594399973, "grad_norm": 16.80107879638672, "learning_rate": 1.6358105103290876e-05, "loss": 2.4103, "step": 3217500 }, { "epoch": 6.736276250125599, "grad_norm": 18.93090057373047, "learning_rate": 1.6352860846800587e-05, "loss": 2.4152, "step": 3218000 }, { "epoch": 6.737322905851224, "grad_norm": 17.224750518798828, "learning_rate": 1.6347616590310295e-05, "loss": 2.3982, "step": 3218500 }, { "epoch": 6.7383695615768495, "grad_norm": 15.068963050842285, "learning_rate": 1.6342372333820002e-05, "loss": 2.4061, "step": 3219000 }, { "epoch": 6.739416217302475, "grad_norm": 21.933015823364258, "learning_rate": 1.6337128077329707e-05, "loss": 2.4011, "step": 3219500 }, { "epoch": 6.7404628730281, "grad_norm": 17.928178787231445, "learning_rate": 1.6331883820839418e-05, "loss": 2.4188, "step": 3220000 }, { "epoch": 6.741509528753726, "grad_norm": 18.812047958374023, "learning_rate": 1.6326639564349125e-05, "loss": 2.4017, "step": 3220500 }, { "epoch": 6.742556184479351, "grad_norm": 16.85081672668457, "learning_rate": 1.6321395307858833e-05, "loss": 2.3965, "step": 3221000 }, { "epoch": 6.743602840204977, "grad_norm": 17.258468627929688, "learning_rate": 1.631615105136854e-05, "loss": 2.4114, "step": 3221500 }, { "epoch": 6.744649495930602, "grad_norm": 14.979782104492188, "learning_rate": 1.631090679487825e-05, "loss": 2.4007, "step": 3222000 }, { "epoch": 6.745696151656228, "grad_norm": 16.481943130493164, "learning_rate": 1.630566253838796e-05, "loss": 2.4089, "step": 3222500 }, { "epoch": 6.746742807381853, "grad_norm": 17.850149154663086, "learning_rate": 1.6300418281897667e-05, "loss": 2.3874, "step": 3223000 }, { "epoch": 6.747789463107479, "grad_norm": 15.858427047729492, "learning_rate": 1.6295174025407375e-05, "loss": 2.4143, "step": 3223500 }, { "epoch": 6.748836118833104, "grad_norm": 13.8504056930542, "learning_rate": 1.6289929768917083e-05, "loss": 2.4039, "step": 3224000 }, { "epoch": 6.7498827745587295, "grad_norm": 20.942529678344727, "learning_rate": 1.6284685512426794e-05, "loss": 2.4152, "step": 3224500 }, { "epoch": 6.750929430284355, "grad_norm": 16.16602325439453, "learning_rate": 1.6279441255936498e-05, "loss": 2.4131, "step": 3225000 }, { "epoch": 6.7519760860099804, "grad_norm": 19.73915672302246, "learning_rate": 1.6274196999446206e-05, "loss": 2.4202, "step": 3225500 }, { "epoch": 6.753022741735606, "grad_norm": 16.633073806762695, "learning_rate": 1.6268952742955913e-05, "loss": 2.407, "step": 3226000 }, { "epoch": 6.754069397461232, "grad_norm": 17.07338523864746, "learning_rate": 1.6263708486465624e-05, "loss": 2.421, "step": 3226500 }, { "epoch": 6.755116053186858, "grad_norm": 18.901073455810547, "learning_rate": 1.6258464229975332e-05, "loss": 2.394, "step": 3227000 }, { "epoch": 6.756162708912483, "grad_norm": 15.989130020141602, "learning_rate": 1.625321997348504e-05, "loss": 2.3958, "step": 3227500 }, { "epoch": 6.757209364638109, "grad_norm": 18.316265106201172, "learning_rate": 1.6247975716994748e-05, "loss": 2.4104, "step": 3228000 }, { "epoch": 6.758256020363734, "grad_norm": 18.490625381469727, "learning_rate": 1.6242731460504455e-05, "loss": 2.4035, "step": 3228500 }, { "epoch": 6.7593026760893595, "grad_norm": 17.441099166870117, "learning_rate": 1.6237487204014166e-05, "loss": 2.3888, "step": 3229000 }, { "epoch": 6.760349331814985, "grad_norm": 15.834283828735352, "learning_rate": 1.6232242947523874e-05, "loss": 2.4134, "step": 3229500 }, { "epoch": 6.76139598754061, "grad_norm": 19.235158920288086, "learning_rate": 1.6226998691033582e-05, "loss": 2.3978, "step": 3230000 }, { "epoch": 6.762442643266236, "grad_norm": 16.863853454589844, "learning_rate": 1.6221754434543286e-05, "loss": 2.4221, "step": 3230500 }, { "epoch": 6.763489298991861, "grad_norm": 21.43692970275879, "learning_rate": 1.6216510178052997e-05, "loss": 2.4053, "step": 3231000 }, { "epoch": 6.764535954717487, "grad_norm": 16.930574417114258, "learning_rate": 1.6211265921562705e-05, "loss": 2.4284, "step": 3231500 }, { "epoch": 6.765582610443112, "grad_norm": 16.290159225463867, "learning_rate": 1.6206021665072412e-05, "loss": 2.3997, "step": 3232000 }, { "epoch": 6.766629266168738, "grad_norm": 18.336883544921875, "learning_rate": 1.620077740858212e-05, "loss": 2.4126, "step": 3232500 }, { "epoch": 6.767675921894363, "grad_norm": 16.817190170288086, "learning_rate": 1.619553315209183e-05, "loss": 2.409, "step": 3233000 }, { "epoch": 6.768722577619989, "grad_norm": 16.537189483642578, "learning_rate": 1.619028889560154e-05, "loss": 2.4191, "step": 3233500 }, { "epoch": 6.769769233345614, "grad_norm": 18.359281539916992, "learning_rate": 1.6185044639111247e-05, "loss": 2.4182, "step": 3234000 }, { "epoch": 6.7708158890712395, "grad_norm": 19.641496658325195, "learning_rate": 1.6179800382620954e-05, "loss": 2.4018, "step": 3234500 }, { "epoch": 6.771862544796865, "grad_norm": 19.788105010986328, "learning_rate": 1.6174556126130662e-05, "loss": 2.401, "step": 3235000 }, { "epoch": 6.7729092005224905, "grad_norm": 18.895732879638672, "learning_rate": 1.6169311869640373e-05, "loss": 2.3997, "step": 3235500 }, { "epoch": 6.773955856248116, "grad_norm": 17.551185607910156, "learning_rate": 1.616406761315008e-05, "loss": 2.4158, "step": 3236000 }, { "epoch": 6.775002511973741, "grad_norm": 15.795044898986816, "learning_rate": 1.6158823356659785e-05, "loss": 2.4097, "step": 3236500 }, { "epoch": 6.776049167699367, "grad_norm": 14.975709915161133, "learning_rate": 1.6153579100169493e-05, "loss": 2.4144, "step": 3237000 }, { "epoch": 6.777095823424992, "grad_norm": 15.169975280761719, "learning_rate": 1.6148334843679204e-05, "loss": 2.4121, "step": 3237500 }, { "epoch": 6.778142479150618, "grad_norm": 17.455476760864258, "learning_rate": 1.614309058718891e-05, "loss": 2.41, "step": 3238000 }, { "epoch": 6.779189134876243, "grad_norm": 17.060121536254883, "learning_rate": 1.613784633069862e-05, "loss": 2.4216, "step": 3238500 }, { "epoch": 6.780235790601869, "grad_norm": 16.60721206665039, "learning_rate": 1.6132602074208327e-05, "loss": 2.389, "step": 3239000 }, { "epoch": 6.781282446327494, "grad_norm": 19.164867401123047, "learning_rate": 1.6127357817718035e-05, "loss": 2.4045, "step": 3239500 }, { "epoch": 6.78232910205312, "grad_norm": 15.520977020263672, "learning_rate": 1.6122113561227746e-05, "loss": 2.4026, "step": 3240000 }, { "epoch": 6.783375757778745, "grad_norm": 24.707612991333008, "learning_rate": 1.6116869304737453e-05, "loss": 2.3947, "step": 3240500 }, { "epoch": 6.7844224135043705, "grad_norm": 15.827875137329102, "learning_rate": 1.611162504824716e-05, "loss": 2.404, "step": 3241000 }, { "epoch": 6.785469069229996, "grad_norm": 15.073210716247559, "learning_rate": 1.610638079175687e-05, "loss": 2.4051, "step": 3241500 }, { "epoch": 6.786515724955622, "grad_norm": 16.809309005737305, "learning_rate": 1.6101136535266576e-05, "loss": 2.4107, "step": 3242000 }, { "epoch": 6.787562380681248, "grad_norm": 15.232001304626465, "learning_rate": 1.6095892278776284e-05, "loss": 2.4, "step": 3242500 }, { "epoch": 6.788609036406873, "grad_norm": 16.489593505859375, "learning_rate": 1.6090648022285992e-05, "loss": 2.3937, "step": 3243000 }, { "epoch": 6.789655692132499, "grad_norm": 14.549190521240234, "learning_rate": 1.60854037657957e-05, "loss": 2.3952, "step": 3243500 }, { "epoch": 6.790702347858124, "grad_norm": 21.933732986450195, "learning_rate": 1.608015950930541e-05, "loss": 2.3981, "step": 3244000 }, { "epoch": 6.7917490035837496, "grad_norm": 19.934494018554688, "learning_rate": 1.607491525281512e-05, "loss": 2.4079, "step": 3244500 }, { "epoch": 6.792795659309375, "grad_norm": 19.026124954223633, "learning_rate": 1.6069670996324826e-05, "loss": 2.407, "step": 3245000 }, { "epoch": 6.7938423150350005, "grad_norm": 17.5217342376709, "learning_rate": 1.6064426739834534e-05, "loss": 2.4051, "step": 3245500 }, { "epoch": 6.794888970760626, "grad_norm": 16.386375427246094, "learning_rate": 1.605918248334424e-05, "loss": 2.4273, "step": 3246000 }, { "epoch": 6.795935626486251, "grad_norm": 15.875527381896973, "learning_rate": 1.6053938226853952e-05, "loss": 2.4145, "step": 3246500 }, { "epoch": 6.796982282211877, "grad_norm": 16.981788635253906, "learning_rate": 1.604869397036366e-05, "loss": 2.3958, "step": 3247000 }, { "epoch": 6.798028937937502, "grad_norm": 20.57143211364746, "learning_rate": 1.6043449713873368e-05, "loss": 2.4024, "step": 3247500 }, { "epoch": 6.799075593663128, "grad_norm": 14.841670989990234, "learning_rate": 1.6038205457383072e-05, "loss": 2.3999, "step": 3248000 }, { "epoch": 6.800122249388753, "grad_norm": 18.63774299621582, "learning_rate": 1.6032961200892783e-05, "loss": 2.4006, "step": 3248500 }, { "epoch": 6.801168905114379, "grad_norm": 17.10550880432129, "learning_rate": 1.602771694440249e-05, "loss": 2.413, "step": 3249000 }, { "epoch": 6.802215560840004, "grad_norm": 13.995051383972168, "learning_rate": 1.60224726879122e-05, "loss": 2.4057, "step": 3249500 }, { "epoch": 6.80326221656563, "grad_norm": 16.47176170349121, "learning_rate": 1.6017228431421906e-05, "loss": 2.4109, "step": 3250000 }, { "epoch": 6.804308872291255, "grad_norm": 19.466856002807617, "learning_rate": 1.6011984174931617e-05, "loss": 2.4008, "step": 3250500 }, { "epoch": 6.8053555280168805, "grad_norm": 18.288188934326172, "learning_rate": 1.6006739918441325e-05, "loss": 2.4014, "step": 3251000 }, { "epoch": 6.806402183742506, "grad_norm": 20.373031616210938, "learning_rate": 1.6001495661951033e-05, "loss": 2.39, "step": 3251500 }, { "epoch": 6.807448839468131, "grad_norm": 18.496402740478516, "learning_rate": 1.599625140546074e-05, "loss": 2.3873, "step": 3252000 }, { "epoch": 6.808495495193757, "grad_norm": 18.431644439697266, "learning_rate": 1.5991007148970448e-05, "loss": 2.3993, "step": 3252500 }, { "epoch": 6.809542150919382, "grad_norm": 18.62577247619629, "learning_rate": 1.598576289248016e-05, "loss": 2.4024, "step": 3253000 }, { "epoch": 6.810588806645008, "grad_norm": 18.849735260009766, "learning_rate": 1.5980518635989864e-05, "loss": 2.4139, "step": 3253500 }, { "epoch": 6.811635462370633, "grad_norm": 20.11202621459961, "learning_rate": 1.597527437949957e-05, "loss": 2.4145, "step": 3254000 }, { "epoch": 6.812682118096259, "grad_norm": 15.256512641906738, "learning_rate": 1.597003012300928e-05, "loss": 2.3953, "step": 3254500 }, { "epoch": 6.813728773821884, "grad_norm": 19.48151397705078, "learning_rate": 1.596478586651899e-05, "loss": 2.3808, "step": 3255000 }, { "epoch": 6.81477542954751, "grad_norm": 19.591028213500977, "learning_rate": 1.5959541610028698e-05, "loss": 2.4098, "step": 3255500 }, { "epoch": 6.815822085273135, "grad_norm": 18.46324920654297, "learning_rate": 1.5954297353538405e-05, "loss": 2.3879, "step": 3256000 }, { "epoch": 6.8168687409987605, "grad_norm": 15.166866302490234, "learning_rate": 1.5949053097048113e-05, "loss": 2.404, "step": 3256500 }, { "epoch": 6.817915396724386, "grad_norm": 14.756194114685059, "learning_rate": 1.594380884055782e-05, "loss": 2.4064, "step": 3257000 }, { "epoch": 6.818962052450011, "grad_norm": 17.788148880004883, "learning_rate": 1.5938564584067532e-05, "loss": 2.3879, "step": 3257500 }, { "epoch": 6.820008708175637, "grad_norm": 14.571598052978516, "learning_rate": 1.593332032757724e-05, "loss": 2.4127, "step": 3258000 }, { "epoch": 6.821055363901262, "grad_norm": 18.363080978393555, "learning_rate": 1.5928076071086947e-05, "loss": 2.4093, "step": 3258500 }, { "epoch": 6.822102019626888, "grad_norm": 17.689050674438477, "learning_rate": 1.592283181459665e-05, "loss": 2.4164, "step": 3259000 }, { "epoch": 6.823148675352513, "grad_norm": 15.493488311767578, "learning_rate": 1.5917587558106363e-05, "loss": 2.4046, "step": 3259500 }, { "epoch": 6.824195331078139, "grad_norm": 18.98586654663086, "learning_rate": 1.591234330161607e-05, "loss": 2.3942, "step": 3260000 }, { "epoch": 6.825241986803764, "grad_norm": 16.57428741455078, "learning_rate": 1.5907099045125778e-05, "loss": 2.4191, "step": 3260500 }, { "epoch": 6.82628864252939, "grad_norm": 18.82330322265625, "learning_rate": 1.5901854788635486e-05, "loss": 2.4003, "step": 3261000 }, { "epoch": 6.827335298255016, "grad_norm": 18.686296463012695, "learning_rate": 1.5896610532145197e-05, "loss": 2.4033, "step": 3261500 }, { "epoch": 6.828381953980641, "grad_norm": 19.784399032592773, "learning_rate": 1.5891366275654904e-05, "loss": 2.401, "step": 3262000 }, { "epoch": 6.829428609706267, "grad_norm": 14.017265319824219, "learning_rate": 1.5886122019164612e-05, "loss": 2.3872, "step": 3262500 }, { "epoch": 6.830475265431892, "grad_norm": 17.663955688476562, "learning_rate": 1.588087776267432e-05, "loss": 2.422, "step": 3263000 }, { "epoch": 6.831521921157518, "grad_norm": 19.775592803955078, "learning_rate": 1.5875633506184028e-05, "loss": 2.4057, "step": 3263500 }, { "epoch": 6.832568576883143, "grad_norm": 14.829655647277832, "learning_rate": 1.587038924969374e-05, "loss": 2.4247, "step": 3264000 }, { "epoch": 6.833615232608769, "grad_norm": 15.325477600097656, "learning_rate": 1.5865144993203446e-05, "loss": 2.4145, "step": 3264500 }, { "epoch": 6.834661888334394, "grad_norm": 20.965728759765625, "learning_rate": 1.585990073671315e-05, "loss": 2.3845, "step": 3265000 }, { "epoch": 6.83570854406002, "grad_norm": 16.34764289855957, "learning_rate": 1.5854656480222858e-05, "loss": 2.4073, "step": 3265500 }, { "epoch": 6.836755199785645, "grad_norm": 16.425817489624023, "learning_rate": 1.584941222373257e-05, "loss": 2.4058, "step": 3266000 }, { "epoch": 6.8378018555112705, "grad_norm": 15.672930717468262, "learning_rate": 1.5844167967242277e-05, "loss": 2.3974, "step": 3266500 }, { "epoch": 6.838848511236896, "grad_norm": 19.171058654785156, "learning_rate": 1.5838923710751985e-05, "loss": 2.3903, "step": 3267000 }, { "epoch": 6.839895166962521, "grad_norm": 16.52620506286621, "learning_rate": 1.5833679454261692e-05, "loss": 2.4012, "step": 3267500 }, { "epoch": 6.840941822688147, "grad_norm": 18.53632164001465, "learning_rate": 1.58284351977714e-05, "loss": 2.3864, "step": 3268000 }, { "epoch": 6.841988478413772, "grad_norm": 23.8011531829834, "learning_rate": 1.582319094128111e-05, "loss": 2.4116, "step": 3268500 }, { "epoch": 6.843035134139398, "grad_norm": 16.936403274536133, "learning_rate": 1.581794668479082e-05, "loss": 2.3809, "step": 3269000 }, { "epoch": 6.844081789865023, "grad_norm": 16.93305206298828, "learning_rate": 1.5812702428300527e-05, "loss": 2.3929, "step": 3269500 }, { "epoch": 6.845128445590649, "grad_norm": 16.0623722076416, "learning_rate": 1.5807458171810234e-05, "loss": 2.4082, "step": 3270000 }, { "epoch": 6.846175101316274, "grad_norm": 15.843059539794922, "learning_rate": 1.5802213915319942e-05, "loss": 2.4104, "step": 3270500 }, { "epoch": 6.8472217570419, "grad_norm": 16.421703338623047, "learning_rate": 1.579696965882965e-05, "loss": 2.4201, "step": 3271000 }, { "epoch": 6.848268412767525, "grad_norm": 19.232662200927734, "learning_rate": 1.5791725402339357e-05, "loss": 2.3868, "step": 3271500 }, { "epoch": 6.8493150684931505, "grad_norm": 19.10182762145996, "learning_rate": 1.5786481145849065e-05, "loss": 2.4149, "step": 3272000 }, { "epoch": 6.850361724218776, "grad_norm": 18.903728485107422, "learning_rate": 1.5781236889358776e-05, "loss": 2.397, "step": 3272500 }, { "epoch": 6.851408379944401, "grad_norm": 17.335479736328125, "learning_rate": 1.5775992632868484e-05, "loss": 2.4037, "step": 3273000 }, { "epoch": 6.852455035670027, "grad_norm": 19.06414031982422, "learning_rate": 1.577074837637819e-05, "loss": 2.4047, "step": 3273500 }, { "epoch": 6.853501691395652, "grad_norm": 19.594331741333008, "learning_rate": 1.57655041198879e-05, "loss": 2.4103, "step": 3274000 }, { "epoch": 6.854548347121278, "grad_norm": 17.417499542236328, "learning_rate": 1.5760259863397607e-05, "loss": 2.4053, "step": 3274500 }, { "epoch": 6.855595002846903, "grad_norm": 17.68454360961914, "learning_rate": 1.5755015606907318e-05, "loss": 2.3966, "step": 3275000 }, { "epoch": 6.856641658572529, "grad_norm": 15.062809944152832, "learning_rate": 1.5749771350417026e-05, "loss": 2.3852, "step": 3275500 }, { "epoch": 6.857688314298154, "grad_norm": 16.213943481445312, "learning_rate": 1.574452709392673e-05, "loss": 2.3985, "step": 3276000 }, { "epoch": 6.85873497002378, "grad_norm": 18.24521255493164, "learning_rate": 1.5739282837436438e-05, "loss": 2.4045, "step": 3276500 }, { "epoch": 6.859781625749406, "grad_norm": 15.838288307189941, "learning_rate": 1.573403858094615e-05, "loss": 2.4051, "step": 3277000 }, { "epoch": 6.860828281475031, "grad_norm": 18.387100219726562, "learning_rate": 1.5728794324455856e-05, "loss": 2.3804, "step": 3277500 }, { "epoch": 6.861874937200657, "grad_norm": 20.522377014160156, "learning_rate": 1.5723550067965564e-05, "loss": 2.4202, "step": 3278000 }, { "epoch": 6.862921592926282, "grad_norm": 16.001859664916992, "learning_rate": 1.5718305811475272e-05, "loss": 2.407, "step": 3278500 }, { "epoch": 6.863968248651908, "grad_norm": 16.495655059814453, "learning_rate": 1.5713061554984983e-05, "loss": 2.3912, "step": 3279000 }, { "epoch": 6.865014904377533, "grad_norm": 18.193950653076172, "learning_rate": 1.570781729849469e-05, "loss": 2.4032, "step": 3279500 }, { "epoch": 6.866061560103159, "grad_norm": 18.600025177001953, "learning_rate": 1.5702573042004398e-05, "loss": 2.4073, "step": 3280000 }, { "epoch": 6.867108215828784, "grad_norm": 20.19570541381836, "learning_rate": 1.5697328785514106e-05, "loss": 2.4023, "step": 3280500 }, { "epoch": 6.86815487155441, "grad_norm": 17.576038360595703, "learning_rate": 1.5692084529023814e-05, "loss": 2.3975, "step": 3281000 }, { "epoch": 6.869201527280035, "grad_norm": 18.176956176757812, "learning_rate": 1.5686840272533525e-05, "loss": 2.4032, "step": 3281500 }, { "epoch": 6.8702481830056605, "grad_norm": 17.390701293945312, "learning_rate": 1.568159601604323e-05, "loss": 2.3813, "step": 3282000 }, { "epoch": 6.871294838731286, "grad_norm": 17.286497116088867, "learning_rate": 1.5676351759552937e-05, "loss": 2.4008, "step": 3282500 }, { "epoch": 6.8723414944569114, "grad_norm": 17.774402618408203, "learning_rate": 1.5671107503062644e-05, "loss": 2.4092, "step": 3283000 }, { "epoch": 6.873388150182537, "grad_norm": 15.559433937072754, "learning_rate": 1.5665863246572356e-05, "loss": 2.4168, "step": 3283500 }, { "epoch": 6.874434805908162, "grad_norm": 15.705904006958008, "learning_rate": 1.5660618990082063e-05, "loss": 2.3872, "step": 3284000 }, { "epoch": 6.875481461633788, "grad_norm": 16.13518714904785, "learning_rate": 1.565537473359177e-05, "loss": 2.4111, "step": 3284500 }, { "epoch": 6.876528117359413, "grad_norm": 22.28446388244629, "learning_rate": 1.565013047710148e-05, "loss": 2.4118, "step": 3285000 }, { "epoch": 6.877574773085039, "grad_norm": 22.80486488342285, "learning_rate": 1.5644886220611186e-05, "loss": 2.4065, "step": 3285500 }, { "epoch": 6.878621428810664, "grad_norm": 16.663118362426758, "learning_rate": 1.5639641964120897e-05, "loss": 2.3998, "step": 3286000 }, { "epoch": 6.87966808453629, "grad_norm": 19.605703353881836, "learning_rate": 1.5634397707630605e-05, "loss": 2.381, "step": 3286500 }, { "epoch": 6.880714740261915, "grad_norm": 15.98331356048584, "learning_rate": 1.5629153451140313e-05, "loss": 2.381, "step": 3287000 }, { "epoch": 6.8817613959875406, "grad_norm": 17.201852798461914, "learning_rate": 1.562390919465002e-05, "loss": 2.4034, "step": 3287500 }, { "epoch": 6.882808051713166, "grad_norm": 17.993656158447266, "learning_rate": 1.5618664938159728e-05, "loss": 2.4059, "step": 3288000 }, { "epoch": 6.8838547074387915, "grad_norm": 18.6168212890625, "learning_rate": 1.5613420681669436e-05, "loss": 2.3917, "step": 3288500 }, { "epoch": 6.884901363164417, "grad_norm": 19.31626319885254, "learning_rate": 1.5608176425179143e-05, "loss": 2.4023, "step": 3289000 }, { "epoch": 6.885948018890042, "grad_norm": 14.778654098510742, "learning_rate": 1.560293216868885e-05, "loss": 2.3974, "step": 3289500 }, { "epoch": 6.886994674615668, "grad_norm": 17.45062255859375, "learning_rate": 1.5597687912198562e-05, "loss": 2.4203, "step": 3290000 }, { "epoch": 6.888041330341293, "grad_norm": 18.442935943603516, "learning_rate": 1.559244365570827e-05, "loss": 2.3942, "step": 3290500 }, { "epoch": 6.889087986066919, "grad_norm": 21.612064361572266, "learning_rate": 1.5587199399217978e-05, "loss": 2.3948, "step": 3291000 }, { "epoch": 6.890134641792544, "grad_norm": 17.427574157714844, "learning_rate": 1.5581955142727685e-05, "loss": 2.4137, "step": 3291500 }, { "epoch": 6.89118129751817, "grad_norm": 16.461376190185547, "learning_rate": 1.5576710886237393e-05, "loss": 2.4019, "step": 3292000 }, { "epoch": 6.892227953243795, "grad_norm": 18.480281829833984, "learning_rate": 1.5571466629747104e-05, "loss": 2.4046, "step": 3292500 }, { "epoch": 6.893274608969421, "grad_norm": 16.375295639038086, "learning_rate": 1.556622237325681e-05, "loss": 2.413, "step": 3293000 }, { "epoch": 6.894321264695046, "grad_norm": 14.423269271850586, "learning_rate": 1.5560978116766516e-05, "loss": 2.3975, "step": 3293500 }, { "epoch": 6.8953679204206715, "grad_norm": 19.81048583984375, "learning_rate": 1.5555733860276224e-05, "loss": 2.4136, "step": 3294000 }, { "epoch": 6.896414576146297, "grad_norm": 17.27353286743164, "learning_rate": 1.5550489603785935e-05, "loss": 2.39, "step": 3294500 }, { "epoch": 6.897461231871922, "grad_norm": 16.950300216674805, "learning_rate": 1.5545245347295643e-05, "loss": 2.4065, "step": 3295000 }, { "epoch": 6.898507887597548, "grad_norm": 15.772186279296875, "learning_rate": 1.554000109080535e-05, "loss": 2.3989, "step": 3295500 }, { "epoch": 6.899554543323173, "grad_norm": 19.4909725189209, "learning_rate": 1.5534756834315058e-05, "loss": 2.414, "step": 3296000 }, { "epoch": 6.9006011990488, "grad_norm": 15.042022705078125, "learning_rate": 1.552951257782477e-05, "loss": 2.4174, "step": 3296500 }, { "epoch": 6.901647854774425, "grad_norm": 19.043413162231445, "learning_rate": 1.5524268321334477e-05, "loss": 2.3966, "step": 3297000 }, { "epoch": 6.902694510500051, "grad_norm": 17.18031120300293, "learning_rate": 1.5519024064844184e-05, "loss": 2.3838, "step": 3297500 }, { "epoch": 6.903741166225676, "grad_norm": 16.055706024169922, "learning_rate": 1.5513779808353892e-05, "loss": 2.3787, "step": 3298000 }, { "epoch": 6.9047878219513015, "grad_norm": 15.452280044555664, "learning_rate": 1.55085355518636e-05, "loss": 2.3913, "step": 3298500 }, { "epoch": 6.905834477676927, "grad_norm": 16.908111572265625, "learning_rate": 1.5503291295373307e-05, "loss": 2.3945, "step": 3299000 }, { "epoch": 6.906881133402552, "grad_norm": 15.143515586853027, "learning_rate": 1.5498047038883015e-05, "loss": 2.4055, "step": 3299500 }, { "epoch": 6.907927789128178, "grad_norm": 21.132551193237305, "learning_rate": 1.5492802782392723e-05, "loss": 2.4019, "step": 3300000 }, { "epoch": 6.908974444853803, "grad_norm": 19.201766967773438, "learning_rate": 1.548755852590243e-05, "loss": 2.4082, "step": 3300500 }, { "epoch": 6.910021100579429, "grad_norm": 18.27129554748535, "learning_rate": 1.548231426941214e-05, "loss": 2.412, "step": 3301000 }, { "epoch": 6.911067756305054, "grad_norm": 17.304771423339844, "learning_rate": 1.547707001292185e-05, "loss": 2.4089, "step": 3301500 }, { "epoch": 6.91211441203068, "grad_norm": 15.748130798339844, "learning_rate": 1.5471825756431557e-05, "loss": 2.4161, "step": 3302000 }, { "epoch": 6.913161067756305, "grad_norm": 17.56981086730957, "learning_rate": 1.5466581499941265e-05, "loss": 2.388, "step": 3302500 }, { "epoch": 6.914207723481931, "grad_norm": 15.546467781066895, "learning_rate": 1.5461337243450972e-05, "loss": 2.3949, "step": 3303000 }, { "epoch": 6.915254379207556, "grad_norm": 20.47332000732422, "learning_rate": 1.5456092986960683e-05, "loss": 2.3834, "step": 3303500 }, { "epoch": 6.9163010349331815, "grad_norm": 16.27126693725586, "learning_rate": 1.545084873047039e-05, "loss": 2.3656, "step": 3304000 }, { "epoch": 6.917347690658807, "grad_norm": 15.963875770568848, "learning_rate": 1.5445604473980095e-05, "loss": 2.3758, "step": 3304500 }, { "epoch": 6.918394346384432, "grad_norm": 14.661938667297363, "learning_rate": 1.5440360217489807e-05, "loss": 2.4113, "step": 3305000 }, { "epoch": 6.919441002110058, "grad_norm": 16.937334060668945, "learning_rate": 1.5435115960999514e-05, "loss": 2.4112, "step": 3305500 }, { "epoch": 6.920487657835683, "grad_norm": 18.968032836914062, "learning_rate": 1.5429871704509222e-05, "loss": 2.4122, "step": 3306000 }, { "epoch": 6.921534313561309, "grad_norm": 19.71728515625, "learning_rate": 1.542462744801893e-05, "loss": 2.3969, "step": 3306500 }, { "epoch": 6.922580969286934, "grad_norm": 19.912153244018555, "learning_rate": 1.5419383191528637e-05, "loss": 2.4151, "step": 3307000 }, { "epoch": 6.92362762501256, "grad_norm": 16.69768524169922, "learning_rate": 1.541413893503835e-05, "loss": 2.4026, "step": 3307500 }, { "epoch": 6.924674280738185, "grad_norm": 17.10674476623535, "learning_rate": 1.5408894678548056e-05, "loss": 2.3943, "step": 3308000 }, { "epoch": 6.925720936463811, "grad_norm": 17.050495147705078, "learning_rate": 1.5403650422057764e-05, "loss": 2.4011, "step": 3308500 }, { "epoch": 6.926767592189436, "grad_norm": 19.563438415527344, "learning_rate": 1.539840616556747e-05, "loss": 2.3937, "step": 3309000 }, { "epoch": 6.9278142479150615, "grad_norm": 16.672447204589844, "learning_rate": 1.539316190907718e-05, "loss": 2.3869, "step": 3309500 }, { "epoch": 6.928860903640687, "grad_norm": 14.82236099243164, "learning_rate": 1.5387917652586887e-05, "loss": 2.3913, "step": 3310000 }, { "epoch": 6.929907559366312, "grad_norm": 17.672443389892578, "learning_rate": 1.5382673396096595e-05, "loss": 2.4221, "step": 3310500 }, { "epoch": 6.930954215091938, "grad_norm": 20.2020320892334, "learning_rate": 1.5377429139606302e-05, "loss": 2.4011, "step": 3311000 }, { "epoch": 6.932000870817564, "grad_norm": 20.760013580322266, "learning_rate": 1.537218488311601e-05, "loss": 2.3828, "step": 3311500 }, { "epoch": 6.93304752654319, "grad_norm": 16.72528648376465, "learning_rate": 1.536694062662572e-05, "loss": 2.4044, "step": 3312000 }, { "epoch": 6.934094182268815, "grad_norm": 19.7445125579834, "learning_rate": 1.536169637013543e-05, "loss": 2.374, "step": 3312500 }, { "epoch": 6.935140837994441, "grad_norm": 18.542530059814453, "learning_rate": 1.5356452113645136e-05, "loss": 2.3692, "step": 3313000 }, { "epoch": 6.936187493720066, "grad_norm": 19.696142196655273, "learning_rate": 1.5351207857154844e-05, "loss": 2.3989, "step": 3313500 }, { "epoch": 6.9372341494456915, "grad_norm": 20.5340518951416, "learning_rate": 1.5345963600664555e-05, "loss": 2.3943, "step": 3314000 }, { "epoch": 6.938280805171317, "grad_norm": 17.73616600036621, "learning_rate": 1.5340719344174263e-05, "loss": 2.4038, "step": 3314500 }, { "epoch": 6.939327460896942, "grad_norm": 16.714439392089844, "learning_rate": 1.533547508768397e-05, "loss": 2.4022, "step": 3315000 }, { "epoch": 6.940374116622568, "grad_norm": 18.637527465820312, "learning_rate": 1.5330230831193678e-05, "loss": 2.3891, "step": 3315500 }, { "epoch": 6.941420772348193, "grad_norm": 16.560503005981445, "learning_rate": 1.5324986574703386e-05, "loss": 2.3878, "step": 3316000 }, { "epoch": 6.942467428073819, "grad_norm": 21.28580665588379, "learning_rate": 1.5319742318213094e-05, "loss": 2.4088, "step": 3316500 }, { "epoch": 6.943514083799444, "grad_norm": 18.19005584716797, "learning_rate": 1.53144980617228e-05, "loss": 2.3729, "step": 3317000 }, { "epoch": 6.94456073952507, "grad_norm": 17.315677642822266, "learning_rate": 1.530925380523251e-05, "loss": 2.4033, "step": 3317500 }, { "epoch": 6.945607395250695, "grad_norm": 18.004756927490234, "learning_rate": 1.5304009548742217e-05, "loss": 2.3949, "step": 3318000 }, { "epoch": 6.946654050976321, "grad_norm": 18.683612823486328, "learning_rate": 1.5298765292251928e-05, "loss": 2.404, "step": 3318500 }, { "epoch": 6.947700706701946, "grad_norm": 24.550596237182617, "learning_rate": 1.5293521035761635e-05, "loss": 2.3822, "step": 3319000 }, { "epoch": 6.9487473624275715, "grad_norm": 16.48596954345703, "learning_rate": 1.5288276779271343e-05, "loss": 2.4076, "step": 3319500 }, { "epoch": 6.949794018153197, "grad_norm": 17.196269989013672, "learning_rate": 1.528303252278105e-05, "loss": 2.3801, "step": 3320000 }, { "epoch": 6.950840673878822, "grad_norm": 14.997711181640625, "learning_rate": 1.527778826629076e-05, "loss": 2.3888, "step": 3320500 }, { "epoch": 6.951887329604448, "grad_norm": 17.739055633544922, "learning_rate": 1.527254400980047e-05, "loss": 2.4191, "step": 3321000 }, { "epoch": 6.952933985330073, "grad_norm": 22.057369232177734, "learning_rate": 1.5267299753310174e-05, "loss": 2.3927, "step": 3321500 }, { "epoch": 6.953980641055699, "grad_norm": 17.23741340637207, "learning_rate": 1.526205549681988e-05, "loss": 2.41, "step": 3322000 }, { "epoch": 6.955027296781324, "grad_norm": 18.32065773010254, "learning_rate": 1.5256811240329591e-05, "loss": 2.3905, "step": 3322500 }, { "epoch": 6.95607395250695, "grad_norm": 18.42357635498047, "learning_rate": 1.5251566983839299e-05, "loss": 2.3815, "step": 3323000 }, { "epoch": 6.957120608232575, "grad_norm": 20.1727294921875, "learning_rate": 1.5246322727349008e-05, "loss": 2.3938, "step": 3323500 }, { "epoch": 6.958167263958201, "grad_norm": 18.70794677734375, "learning_rate": 1.5241078470858716e-05, "loss": 2.3819, "step": 3324000 }, { "epoch": 6.959213919683826, "grad_norm": 14.317422866821289, "learning_rate": 1.5235834214368425e-05, "loss": 2.3847, "step": 3324500 }, { "epoch": 6.9602605754094515, "grad_norm": 16.604721069335938, "learning_rate": 1.5230589957878133e-05, "loss": 2.385, "step": 3325000 }, { "epoch": 6.961307231135077, "grad_norm": 19.74515151977539, "learning_rate": 1.5225345701387842e-05, "loss": 2.3699, "step": 3325500 }, { "epoch": 6.9623538868607024, "grad_norm": 18.475675582885742, "learning_rate": 1.522010144489755e-05, "loss": 2.4136, "step": 3326000 }, { "epoch": 6.963400542586328, "grad_norm": 17.496315002441406, "learning_rate": 1.5214857188407258e-05, "loss": 2.3927, "step": 3326500 }, { "epoch": 6.964447198311953, "grad_norm": 16.87066650390625, "learning_rate": 1.5209612931916964e-05, "loss": 2.3731, "step": 3327000 }, { "epoch": 6.965493854037579, "grad_norm": 19.211917877197266, "learning_rate": 1.5204368675426673e-05, "loss": 2.3917, "step": 3327500 }, { "epoch": 6.966540509763204, "grad_norm": 17.21869468688965, "learning_rate": 1.519912441893638e-05, "loss": 2.3801, "step": 3328000 }, { "epoch": 6.96758716548883, "grad_norm": 18.203580856323242, "learning_rate": 1.5193880162446088e-05, "loss": 2.3814, "step": 3328500 }, { "epoch": 6.968633821214455, "grad_norm": 17.377838134765625, "learning_rate": 1.5188635905955798e-05, "loss": 2.4074, "step": 3329000 }, { "epoch": 6.969680476940081, "grad_norm": 18.856422424316406, "learning_rate": 1.5183391649465505e-05, "loss": 2.4086, "step": 3329500 }, { "epoch": 6.970727132665706, "grad_norm": 20.297876358032227, "learning_rate": 1.5178147392975215e-05, "loss": 2.4189, "step": 3330000 }, { "epoch": 6.9717737883913315, "grad_norm": 17.535554885864258, "learning_rate": 1.5172903136484923e-05, "loss": 2.417, "step": 3330500 }, { "epoch": 6.972820444116957, "grad_norm": 19.473230361938477, "learning_rate": 1.5167658879994632e-05, "loss": 2.3821, "step": 3331000 }, { "epoch": 6.973867099842583, "grad_norm": 15.712959289550781, "learning_rate": 1.516241462350434e-05, "loss": 2.3792, "step": 3331500 }, { "epoch": 6.974913755568209, "grad_norm": 16.26398277282715, "learning_rate": 1.5157170367014047e-05, "loss": 2.4002, "step": 3332000 }, { "epoch": 6.975960411293834, "grad_norm": 19.009475708007812, "learning_rate": 1.5151926110523757e-05, "loss": 2.3901, "step": 3332500 }, { "epoch": 6.97700706701946, "grad_norm": 19.870058059692383, "learning_rate": 1.5146681854033463e-05, "loss": 2.4076, "step": 3333000 }, { "epoch": 6.978053722745085, "grad_norm": 18.497154235839844, "learning_rate": 1.514143759754317e-05, "loss": 2.3841, "step": 3333500 }, { "epoch": 6.979100378470711, "grad_norm": 19.8924503326416, "learning_rate": 1.5136193341052878e-05, "loss": 2.3943, "step": 3334000 }, { "epoch": 6.980147034196336, "grad_norm": 15.098669052124023, "learning_rate": 1.5130949084562587e-05, "loss": 2.4035, "step": 3334500 }, { "epoch": 6.9811936899219615, "grad_norm": 16.797128677368164, "learning_rate": 1.5125704828072295e-05, "loss": 2.4043, "step": 3335000 }, { "epoch": 6.982240345647587, "grad_norm": 15.77901840209961, "learning_rate": 1.5120460571582005e-05, "loss": 2.3882, "step": 3335500 }, { "epoch": 6.9832870013732125, "grad_norm": 15.629432678222656, "learning_rate": 1.5115216315091712e-05, "loss": 2.3977, "step": 3336000 }, { "epoch": 6.984333657098838, "grad_norm": 17.831111907958984, "learning_rate": 1.5109972058601422e-05, "loss": 2.3973, "step": 3336500 }, { "epoch": 6.985380312824463, "grad_norm": 19.27949333190918, "learning_rate": 1.510472780211113e-05, "loss": 2.3881, "step": 3337000 }, { "epoch": 6.986426968550089, "grad_norm": 15.746010780334473, "learning_rate": 1.5099483545620837e-05, "loss": 2.3948, "step": 3337500 }, { "epoch": 6.987473624275714, "grad_norm": 21.886384963989258, "learning_rate": 1.5094239289130546e-05, "loss": 2.3912, "step": 3338000 }, { "epoch": 6.98852028000134, "grad_norm": 16.572265625, "learning_rate": 1.5088995032640252e-05, "loss": 2.4021, "step": 3338500 }, { "epoch": 6.989566935726965, "grad_norm": 19.150373458862305, "learning_rate": 1.508375077614996e-05, "loss": 2.4045, "step": 3339000 }, { "epoch": 6.990613591452591, "grad_norm": 20.047443389892578, "learning_rate": 1.507850651965967e-05, "loss": 2.4069, "step": 3339500 }, { "epoch": 6.991660247178216, "grad_norm": 16.80314826965332, "learning_rate": 1.5073262263169377e-05, "loss": 2.3831, "step": 3340000 }, { "epoch": 6.992706902903842, "grad_norm": 17.855518341064453, "learning_rate": 1.5068018006679085e-05, "loss": 2.3888, "step": 3340500 }, { "epoch": 6.993753558629467, "grad_norm": 17.53550910949707, "learning_rate": 1.5062773750188794e-05, "loss": 2.4022, "step": 3341000 }, { "epoch": 6.9948002143550925, "grad_norm": 16.844341278076172, "learning_rate": 1.5057529493698502e-05, "loss": 2.4006, "step": 3341500 }, { "epoch": 6.995846870080718, "grad_norm": 15.789566040039062, "learning_rate": 1.5052285237208211e-05, "loss": 2.4028, "step": 3342000 }, { "epoch": 6.996893525806343, "grad_norm": 18.399494171142578, "learning_rate": 1.5047040980717919e-05, "loss": 2.3915, "step": 3342500 }, { "epoch": 6.997940181531969, "grad_norm": 18.586015701293945, "learning_rate": 1.5041796724227628e-05, "loss": 2.4019, "step": 3343000 }, { "epoch": 6.998986837257594, "grad_norm": 21.36089515686035, "learning_rate": 1.5036552467737336e-05, "loss": 2.3995, "step": 3343500 }, { "epoch": 7.00003349298322, "grad_norm": 18.72563934326172, "learning_rate": 1.5031308211247042e-05, "loss": 2.402, "step": 3344000 }, { "epoch": 7.001080148708845, "grad_norm": 17.92380142211914, "learning_rate": 1.502606395475675e-05, "loss": 2.3792, "step": 3344500 }, { "epoch": 7.002126804434471, "grad_norm": 16.866281509399414, "learning_rate": 1.5020819698266459e-05, "loss": 2.3877, "step": 3345000 }, { "epoch": 7.003173460160096, "grad_norm": 17.492834091186523, "learning_rate": 1.5015575441776167e-05, "loss": 2.3889, "step": 3345500 }, { "epoch": 7.004220115885722, "grad_norm": 17.511465072631836, "learning_rate": 1.5010331185285875e-05, "loss": 2.3881, "step": 3346000 }, { "epoch": 7.005266771611347, "grad_norm": 17.186670303344727, "learning_rate": 1.5005086928795584e-05, "loss": 2.3674, "step": 3346500 }, { "epoch": 7.0063134273369725, "grad_norm": 18.150135040283203, "learning_rate": 1.4999842672305292e-05, "loss": 2.3772, "step": 3347000 }, { "epoch": 7.007360083062598, "grad_norm": 17.37529754638672, "learning_rate": 1.4994598415815001e-05, "loss": 2.404, "step": 3347500 }, { "epoch": 7.008406738788224, "grad_norm": 16.314189910888672, "learning_rate": 1.4989354159324709e-05, "loss": 2.3638, "step": 3348000 }, { "epoch": 7.00945339451385, "grad_norm": 21.240724563598633, "learning_rate": 1.4984109902834418e-05, "loss": 2.3897, "step": 3348500 }, { "epoch": 7.010500050239475, "grad_norm": 20.672710418701172, "learning_rate": 1.4978865646344126e-05, "loss": 2.3869, "step": 3349000 }, { "epoch": 7.011546705965101, "grad_norm": 21.96749496459961, "learning_rate": 1.4973621389853833e-05, "loss": 2.3847, "step": 3349500 }, { "epoch": 7.012593361690726, "grad_norm": 15.403396606445312, "learning_rate": 1.496837713336354e-05, "loss": 2.3892, "step": 3350000 }, { "epoch": 7.013640017416352, "grad_norm": 16.397994995117188, "learning_rate": 1.4963132876873249e-05, "loss": 2.3957, "step": 3350500 }, { "epoch": 7.014686673141977, "grad_norm": 18.903425216674805, "learning_rate": 1.4957888620382956e-05, "loss": 2.389, "step": 3351000 }, { "epoch": 7.0157333288676025, "grad_norm": 20.28007698059082, "learning_rate": 1.4952644363892664e-05, "loss": 2.3795, "step": 3351500 }, { "epoch": 7.016779984593228, "grad_norm": 16.01515007019043, "learning_rate": 1.4947400107402374e-05, "loss": 2.3909, "step": 3352000 }, { "epoch": 7.017826640318853, "grad_norm": 16.66437530517578, "learning_rate": 1.4942155850912081e-05, "loss": 2.3885, "step": 3352500 }, { "epoch": 7.018873296044479, "grad_norm": 15.176904678344727, "learning_rate": 1.493691159442179e-05, "loss": 2.3784, "step": 3353000 }, { "epoch": 7.019919951770104, "grad_norm": 20.832347869873047, "learning_rate": 1.4931667337931498e-05, "loss": 2.394, "step": 3353500 }, { "epoch": 7.02096660749573, "grad_norm": 14.343689918518066, "learning_rate": 1.4926423081441208e-05, "loss": 2.405, "step": 3354000 }, { "epoch": 7.022013263221355, "grad_norm": 15.762131690979004, "learning_rate": 1.4921178824950915e-05, "loss": 2.3904, "step": 3354500 }, { "epoch": 7.023059918946981, "grad_norm": 15.380992889404297, "learning_rate": 1.4915934568460623e-05, "loss": 2.3755, "step": 3355000 }, { "epoch": 7.024106574672606, "grad_norm": 18.805768966674805, "learning_rate": 1.4910690311970329e-05, "loss": 2.3725, "step": 3355500 }, { "epoch": 7.025153230398232, "grad_norm": 17.697837829589844, "learning_rate": 1.4905446055480038e-05, "loss": 2.3987, "step": 3356000 }, { "epoch": 7.026199886123857, "grad_norm": 16.43775749206543, "learning_rate": 1.4900201798989746e-05, "loss": 2.3847, "step": 3356500 }, { "epoch": 7.0272465418494825, "grad_norm": 18.834014892578125, "learning_rate": 1.4894957542499456e-05, "loss": 2.3797, "step": 3357000 }, { "epoch": 7.028293197575108, "grad_norm": 16.9306697845459, "learning_rate": 1.4889713286009163e-05, "loss": 2.3957, "step": 3357500 }, { "epoch": 7.029339853300733, "grad_norm": 17.630857467651367, "learning_rate": 1.4884469029518871e-05, "loss": 2.3837, "step": 3358000 }, { "epoch": 7.030386509026359, "grad_norm": 15.878568649291992, "learning_rate": 1.487922477302858e-05, "loss": 2.376, "step": 3358500 }, { "epoch": 7.031433164751984, "grad_norm": 19.983089447021484, "learning_rate": 1.4873980516538288e-05, "loss": 2.388, "step": 3359000 }, { "epoch": 7.03247982047761, "grad_norm": 18.224536895751953, "learning_rate": 1.4868736260047997e-05, "loss": 2.3761, "step": 3359500 }, { "epoch": 7.033526476203235, "grad_norm": 16.0930118560791, "learning_rate": 1.4863492003557705e-05, "loss": 2.4024, "step": 3360000 }, { "epoch": 7.034573131928861, "grad_norm": 17.403533935546875, "learning_rate": 1.4858247747067413e-05, "loss": 2.3877, "step": 3360500 }, { "epoch": 7.035619787654486, "grad_norm": 15.652461051940918, "learning_rate": 1.4853003490577119e-05, "loss": 2.3838, "step": 3361000 }, { "epoch": 7.036666443380112, "grad_norm": 19.950057983398438, "learning_rate": 1.4847759234086828e-05, "loss": 2.3919, "step": 3361500 }, { "epoch": 7.037713099105737, "grad_norm": 16.819303512573242, "learning_rate": 1.4842514977596536e-05, "loss": 2.4022, "step": 3362000 }, { "epoch": 7.0387597548313625, "grad_norm": 19.66718101501465, "learning_rate": 1.4837270721106245e-05, "loss": 2.3971, "step": 3362500 }, { "epoch": 7.039806410556988, "grad_norm": 16.84881019592285, "learning_rate": 1.4832026464615953e-05, "loss": 2.3967, "step": 3363000 }, { "epoch": 7.040853066282613, "grad_norm": 17.495609283447266, "learning_rate": 1.482678220812566e-05, "loss": 2.3905, "step": 3363500 }, { "epoch": 7.041899722008239, "grad_norm": 18.11773681640625, "learning_rate": 1.482153795163537e-05, "loss": 2.3849, "step": 3364000 }, { "epoch": 7.042946377733864, "grad_norm": 18.855937957763672, "learning_rate": 1.4816293695145078e-05, "loss": 2.4044, "step": 3364500 }, { "epoch": 7.04399303345949, "grad_norm": 17.514314651489258, "learning_rate": 1.4811049438654787e-05, "loss": 2.3869, "step": 3365000 }, { "epoch": 7.045039689185116, "grad_norm": 16.239334106445312, "learning_rate": 1.4805805182164495e-05, "loss": 2.3824, "step": 3365500 }, { "epoch": 7.046086344910742, "grad_norm": 21.06549835205078, "learning_rate": 1.4800560925674204e-05, "loss": 2.3738, "step": 3366000 }, { "epoch": 7.047133000636367, "grad_norm": 16.877389907836914, "learning_rate": 1.4795316669183912e-05, "loss": 2.388, "step": 3366500 }, { "epoch": 7.0481796563619925, "grad_norm": 17.210308074951172, "learning_rate": 1.4790072412693618e-05, "loss": 2.4, "step": 3367000 }, { "epoch": 7.049226312087618, "grad_norm": 17.07335090637207, "learning_rate": 1.4784828156203326e-05, "loss": 2.3788, "step": 3367500 }, { "epoch": 7.050272967813243, "grad_norm": 17.261653900146484, "learning_rate": 1.4779583899713035e-05, "loss": 2.3923, "step": 3368000 }, { "epoch": 7.051319623538869, "grad_norm": 18.284236907958984, "learning_rate": 1.4774339643222743e-05, "loss": 2.3839, "step": 3368500 }, { "epoch": 7.052366279264494, "grad_norm": 17.256685256958008, "learning_rate": 1.476909538673245e-05, "loss": 2.3782, "step": 3369000 }, { "epoch": 7.05341293499012, "grad_norm": 16.821407318115234, "learning_rate": 1.476385113024216e-05, "loss": 2.3741, "step": 3369500 }, { "epoch": 7.054459590715745, "grad_norm": 17.831012725830078, "learning_rate": 1.4758606873751867e-05, "loss": 2.3826, "step": 3370000 }, { "epoch": 7.055506246441371, "grad_norm": 14.602365493774414, "learning_rate": 1.4753362617261577e-05, "loss": 2.3793, "step": 3370500 }, { "epoch": 7.056552902166996, "grad_norm": 17.61505699157715, "learning_rate": 1.4748118360771284e-05, "loss": 2.4003, "step": 3371000 }, { "epoch": 7.057599557892622, "grad_norm": 17.434589385986328, "learning_rate": 1.4742874104280994e-05, "loss": 2.3634, "step": 3371500 }, { "epoch": 7.058646213618247, "grad_norm": 17.371305465698242, "learning_rate": 1.4737629847790702e-05, "loss": 2.3938, "step": 3372000 }, { "epoch": 7.0596928693438725, "grad_norm": 16.87921905517578, "learning_rate": 1.4732385591300408e-05, "loss": 2.3968, "step": 3372500 }, { "epoch": 7.060739525069498, "grad_norm": 16.478239059448242, "learning_rate": 1.4727141334810115e-05, "loss": 2.3978, "step": 3373000 }, { "epoch": 7.061786180795123, "grad_norm": 19.292808532714844, "learning_rate": 1.4721897078319825e-05, "loss": 2.3941, "step": 3373500 }, { "epoch": 7.062832836520749, "grad_norm": 23.838266372680664, "learning_rate": 1.4716652821829532e-05, "loss": 2.3846, "step": 3374000 }, { "epoch": 7.063879492246374, "grad_norm": 16.420352935791016, "learning_rate": 1.471140856533924e-05, "loss": 2.398, "step": 3374500 }, { "epoch": 7.064926147972, "grad_norm": 18.387727737426758, "learning_rate": 1.470616430884895e-05, "loss": 2.3909, "step": 3375000 }, { "epoch": 7.065972803697625, "grad_norm": 16.36798858642578, "learning_rate": 1.4700920052358657e-05, "loss": 2.4083, "step": 3375500 }, { "epoch": 7.067019459423251, "grad_norm": 18.039005279541016, "learning_rate": 1.4695675795868366e-05, "loss": 2.3935, "step": 3376000 }, { "epoch": 7.068066115148876, "grad_norm": 18.389873504638672, "learning_rate": 1.4690431539378074e-05, "loss": 2.3776, "step": 3376500 }, { "epoch": 7.069112770874502, "grad_norm": 16.07623291015625, "learning_rate": 1.4685187282887784e-05, "loss": 2.3848, "step": 3377000 }, { "epoch": 7.070159426600127, "grad_norm": 13.754815101623535, "learning_rate": 1.4679943026397491e-05, "loss": 2.3921, "step": 3377500 }, { "epoch": 7.0712060823257525, "grad_norm": 18.00825309753418, "learning_rate": 1.4674698769907199e-05, "loss": 2.4067, "step": 3378000 }, { "epoch": 7.072252738051378, "grad_norm": 16.49422836303711, "learning_rate": 1.4669454513416905e-05, "loss": 2.3668, "step": 3378500 }, { "epoch": 7.0732993937770035, "grad_norm": 15.275202751159668, "learning_rate": 1.4664210256926614e-05, "loss": 2.3988, "step": 3379000 }, { "epoch": 7.074346049502629, "grad_norm": 18.30266761779785, "learning_rate": 1.4658966000436322e-05, "loss": 2.3883, "step": 3379500 }, { "epoch": 7.075392705228254, "grad_norm": 18.88425064086914, "learning_rate": 1.4653721743946031e-05, "loss": 2.3689, "step": 3380000 }, { "epoch": 7.07643936095388, "grad_norm": 16.669038772583008, "learning_rate": 1.4648477487455739e-05, "loss": 2.3738, "step": 3380500 }, { "epoch": 7.077486016679505, "grad_norm": 18.601625442504883, "learning_rate": 1.4643233230965447e-05, "loss": 2.3899, "step": 3381000 }, { "epoch": 7.078532672405131, "grad_norm": 15.798415184020996, "learning_rate": 1.4637988974475156e-05, "loss": 2.373, "step": 3381500 }, { "epoch": 7.079579328130756, "grad_norm": 18.516603469848633, "learning_rate": 1.4632744717984864e-05, "loss": 2.3831, "step": 3382000 }, { "epoch": 7.0806259838563825, "grad_norm": 18.052637100219727, "learning_rate": 1.4627500461494573e-05, "loss": 2.3881, "step": 3382500 }, { "epoch": 7.081672639582008, "grad_norm": 14.730730056762695, "learning_rate": 1.4622256205004281e-05, "loss": 2.38, "step": 3383000 }, { "epoch": 7.0827192953076334, "grad_norm": 14.669522285461426, "learning_rate": 1.461701194851399e-05, "loss": 2.4012, "step": 3383500 }, { "epoch": 7.083765951033259, "grad_norm": 16.94715690612793, "learning_rate": 1.4611767692023695e-05, "loss": 2.3734, "step": 3384000 }, { "epoch": 7.084812606758884, "grad_norm": 16.27655029296875, "learning_rate": 1.4606523435533404e-05, "loss": 2.3736, "step": 3384500 }, { "epoch": 7.08585926248451, "grad_norm": 23.309268951416016, "learning_rate": 1.4601279179043112e-05, "loss": 2.3727, "step": 3385000 }, { "epoch": 7.086905918210135, "grad_norm": 16.949951171875, "learning_rate": 1.4596034922552821e-05, "loss": 2.386, "step": 3385500 }, { "epoch": 7.087952573935761, "grad_norm": 16.32318115234375, "learning_rate": 1.4590790666062529e-05, "loss": 2.3892, "step": 3386000 }, { "epoch": 7.088999229661386, "grad_norm": 19.21217155456543, "learning_rate": 1.4585546409572236e-05, "loss": 2.3829, "step": 3386500 }, { "epoch": 7.090045885387012, "grad_norm": 16.459531784057617, "learning_rate": 1.4580302153081946e-05, "loss": 2.3898, "step": 3387000 }, { "epoch": 7.091092541112637, "grad_norm": 13.818009376525879, "learning_rate": 1.4575057896591654e-05, "loss": 2.3722, "step": 3387500 }, { "epoch": 7.0921391968382625, "grad_norm": 19.03297996520996, "learning_rate": 1.4569813640101363e-05, "loss": 2.3821, "step": 3388000 }, { "epoch": 7.093185852563888, "grad_norm": 17.15863800048828, "learning_rate": 1.456456938361107e-05, "loss": 2.4045, "step": 3388500 }, { "epoch": 7.0942325082895135, "grad_norm": 18.966833114624023, "learning_rate": 1.455932512712078e-05, "loss": 2.3862, "step": 3389000 }, { "epoch": 7.095279164015139, "grad_norm": 15.785225868225098, "learning_rate": 1.4554080870630484e-05, "loss": 2.3952, "step": 3389500 }, { "epoch": 7.096325819740764, "grad_norm": 16.927961349487305, "learning_rate": 1.4548836614140194e-05, "loss": 2.3872, "step": 3390000 }, { "epoch": 7.09737247546639, "grad_norm": 21.50270652770996, "learning_rate": 1.4543592357649901e-05, "loss": 2.3683, "step": 3390500 }, { "epoch": 7.098419131192015, "grad_norm": 19.90280532836914, "learning_rate": 1.453834810115961e-05, "loss": 2.3716, "step": 3391000 }, { "epoch": 7.099465786917641, "grad_norm": 20.199594497680664, "learning_rate": 1.4533103844669318e-05, "loss": 2.3704, "step": 3391500 }, { "epoch": 7.100512442643266, "grad_norm": 22.25794792175293, "learning_rate": 1.4527859588179026e-05, "loss": 2.3622, "step": 3392000 }, { "epoch": 7.101559098368892, "grad_norm": 14.0731840133667, "learning_rate": 1.4522615331688736e-05, "loss": 2.3842, "step": 3392500 }, { "epoch": 7.102605754094517, "grad_norm": 17.076826095581055, "learning_rate": 1.4517371075198443e-05, "loss": 2.4025, "step": 3393000 }, { "epoch": 7.103652409820143, "grad_norm": 17.706539154052734, "learning_rate": 1.4512126818708153e-05, "loss": 2.3862, "step": 3393500 }, { "epoch": 7.104699065545768, "grad_norm": 22.407747268676758, "learning_rate": 1.450688256221786e-05, "loss": 2.4018, "step": 3394000 }, { "epoch": 7.1057457212713935, "grad_norm": 18.953815460205078, "learning_rate": 1.450163830572757e-05, "loss": 2.3842, "step": 3394500 }, { "epoch": 7.106792376997019, "grad_norm": 32.97822570800781, "learning_rate": 1.4496394049237277e-05, "loss": 2.4069, "step": 3395000 }, { "epoch": 7.107839032722644, "grad_norm": 15.773099899291992, "learning_rate": 1.4491149792746983e-05, "loss": 2.3925, "step": 3395500 }, { "epoch": 7.10888568844827, "grad_norm": 17.76837158203125, "learning_rate": 1.4485905536256691e-05, "loss": 2.3953, "step": 3396000 }, { "epoch": 7.109932344173895, "grad_norm": 17.906042098999023, "learning_rate": 1.44806612797664e-05, "loss": 2.3936, "step": 3396500 }, { "epoch": 7.110978999899521, "grad_norm": 15.497894287109375, "learning_rate": 1.4475417023276108e-05, "loss": 2.3875, "step": 3397000 }, { "epoch": 7.112025655625146, "grad_norm": 15.601967811584473, "learning_rate": 1.4470172766785818e-05, "loss": 2.3962, "step": 3397500 }, { "epoch": 7.113072311350772, "grad_norm": 16.537334442138672, "learning_rate": 1.4464928510295525e-05, "loss": 2.3989, "step": 3398000 }, { "epoch": 7.114118967076397, "grad_norm": 21.020959854125977, "learning_rate": 1.4459684253805233e-05, "loss": 2.3937, "step": 3398500 }, { "epoch": 7.115165622802023, "grad_norm": 17.287616729736328, "learning_rate": 1.4454439997314942e-05, "loss": 2.3729, "step": 3399000 }, { "epoch": 7.116212278527648, "grad_norm": 18.84701919555664, "learning_rate": 1.444919574082465e-05, "loss": 2.3778, "step": 3399500 }, { "epoch": 7.1172589342532735, "grad_norm": 17.85919952392578, "learning_rate": 1.444395148433436e-05, "loss": 2.3757, "step": 3400000 }, { "epoch": 7.1183055899789, "grad_norm": 19.861238479614258, "learning_rate": 1.4438707227844067e-05, "loss": 2.4001, "step": 3400500 }, { "epoch": 7.119352245704525, "grad_norm": 16.52490997314453, "learning_rate": 1.4433462971353773e-05, "loss": 2.3994, "step": 3401000 }, { "epoch": 7.120398901430151, "grad_norm": 17.35601806640625, "learning_rate": 1.442821871486348e-05, "loss": 2.3874, "step": 3401500 }, { "epoch": 7.121445557155776, "grad_norm": 16.20722007751465, "learning_rate": 1.442297445837319e-05, "loss": 2.3801, "step": 3402000 }, { "epoch": 7.122492212881402, "grad_norm": 21.97520637512207, "learning_rate": 1.4417730201882898e-05, "loss": 2.3873, "step": 3402500 }, { "epoch": 7.123538868607027, "grad_norm": 16.305524826049805, "learning_rate": 1.4412485945392607e-05, "loss": 2.3763, "step": 3403000 }, { "epoch": 7.124585524332653, "grad_norm": 19.261707305908203, "learning_rate": 1.4407241688902315e-05, "loss": 2.3723, "step": 3403500 }, { "epoch": 7.125632180058278, "grad_norm": 20.562204360961914, "learning_rate": 1.4401997432412023e-05, "loss": 2.388, "step": 3404000 }, { "epoch": 7.1266788357839035, "grad_norm": 18.85658836364746, "learning_rate": 1.4396753175921732e-05, "loss": 2.3843, "step": 3404500 }, { "epoch": 7.127725491509529, "grad_norm": 18.788663864135742, "learning_rate": 1.439150891943144e-05, "loss": 2.3925, "step": 3405000 }, { "epoch": 7.128772147235154, "grad_norm": 17.334909439086914, "learning_rate": 1.4386264662941149e-05, "loss": 2.3939, "step": 3405500 }, { "epoch": 7.12981880296078, "grad_norm": 16.80286407470703, "learning_rate": 1.4381020406450857e-05, "loss": 2.3889, "step": 3406000 }, { "epoch": 7.130865458686405, "grad_norm": 16.492538452148438, "learning_rate": 1.4375776149960563e-05, "loss": 2.381, "step": 3406500 }, { "epoch": 7.131912114412031, "grad_norm": 14.960488319396973, "learning_rate": 1.437053189347027e-05, "loss": 2.4013, "step": 3407000 }, { "epoch": 7.132958770137656, "grad_norm": 20.9919376373291, "learning_rate": 1.436528763697998e-05, "loss": 2.3705, "step": 3407500 }, { "epoch": 7.134005425863282, "grad_norm": 15.413309097290039, "learning_rate": 1.4360043380489688e-05, "loss": 2.3824, "step": 3408000 }, { "epoch": 7.135052081588907, "grad_norm": 17.779489517211914, "learning_rate": 1.4354799123999397e-05, "loss": 2.3938, "step": 3408500 }, { "epoch": 7.136098737314533, "grad_norm": 18.785537719726562, "learning_rate": 1.4349554867509105e-05, "loss": 2.3867, "step": 3409000 }, { "epoch": 7.137145393040158, "grad_norm": 17.92205238342285, "learning_rate": 1.4344310611018812e-05, "loss": 2.3771, "step": 3409500 }, { "epoch": 7.1381920487657835, "grad_norm": 16.48213005065918, "learning_rate": 1.4339066354528522e-05, "loss": 2.3697, "step": 3410000 }, { "epoch": 7.139238704491409, "grad_norm": 17.08223533630371, "learning_rate": 1.433382209803823e-05, "loss": 2.3767, "step": 3410500 }, { "epoch": 7.140285360217034, "grad_norm": 17.177270889282227, "learning_rate": 1.4328577841547939e-05, "loss": 2.3642, "step": 3411000 }, { "epoch": 7.14133201594266, "grad_norm": 16.66640853881836, "learning_rate": 1.4323333585057646e-05, "loss": 2.3751, "step": 3411500 }, { "epoch": 7.142378671668285, "grad_norm": 19.82608985900879, "learning_rate": 1.4318089328567356e-05, "loss": 2.3945, "step": 3412000 }, { "epoch": 7.143425327393911, "grad_norm": 16.088743209838867, "learning_rate": 1.431284507207706e-05, "loss": 2.3689, "step": 3412500 }, { "epoch": 7.144471983119536, "grad_norm": 27.73908233642578, "learning_rate": 1.430760081558677e-05, "loss": 2.3917, "step": 3413000 }, { "epoch": 7.145518638845162, "grad_norm": 17.94135856628418, "learning_rate": 1.4302356559096477e-05, "loss": 2.3891, "step": 3413500 }, { "epoch": 7.146565294570787, "grad_norm": 16.51995277404785, "learning_rate": 1.4297112302606187e-05, "loss": 2.3839, "step": 3414000 }, { "epoch": 7.147611950296413, "grad_norm": 18.760826110839844, "learning_rate": 1.4291868046115894e-05, "loss": 2.3783, "step": 3414500 }, { "epoch": 7.148658606022038, "grad_norm": 22.288225173950195, "learning_rate": 1.4286623789625602e-05, "loss": 2.3879, "step": 3415000 }, { "epoch": 7.1497052617476635, "grad_norm": 16.029781341552734, "learning_rate": 1.4281379533135311e-05, "loss": 2.3907, "step": 3415500 }, { "epoch": 7.150751917473289, "grad_norm": 15.881861686706543, "learning_rate": 1.4276135276645019e-05, "loss": 2.3842, "step": 3416000 }, { "epoch": 7.151798573198914, "grad_norm": 17.192995071411133, "learning_rate": 1.4270891020154728e-05, "loss": 2.3731, "step": 3416500 }, { "epoch": 7.15284522892454, "grad_norm": 16.743539810180664, "learning_rate": 1.4265646763664436e-05, "loss": 2.3679, "step": 3417000 }, { "epoch": 7.153891884650166, "grad_norm": 17.390663146972656, "learning_rate": 1.4260402507174145e-05, "loss": 2.3672, "step": 3417500 }, { "epoch": 7.154938540375792, "grad_norm": 17.847091674804688, "learning_rate": 1.425515825068385e-05, "loss": 2.3917, "step": 3418000 }, { "epoch": 7.155985196101417, "grad_norm": 17.53412437438965, "learning_rate": 1.424991399419356e-05, "loss": 2.3739, "step": 3418500 }, { "epoch": 7.157031851827043, "grad_norm": 16.3203182220459, "learning_rate": 1.4244669737703267e-05, "loss": 2.3905, "step": 3419000 }, { "epoch": 7.158078507552668, "grad_norm": 18.423967361450195, "learning_rate": 1.4239425481212976e-05, "loss": 2.391, "step": 3419500 }, { "epoch": 7.1591251632782935, "grad_norm": 16.705669403076172, "learning_rate": 1.4234181224722684e-05, "loss": 2.3958, "step": 3420000 }, { "epoch": 7.160171819003919, "grad_norm": 16.880529403686523, "learning_rate": 1.4228936968232393e-05, "loss": 2.3742, "step": 3420500 }, { "epoch": 7.161218474729544, "grad_norm": 18.935564041137695, "learning_rate": 1.4223692711742101e-05, "loss": 2.3903, "step": 3421000 }, { "epoch": 7.16226513045517, "grad_norm": 16.534774780273438, "learning_rate": 1.4218448455251809e-05, "loss": 2.3839, "step": 3421500 }, { "epoch": 7.163311786180795, "grad_norm": 17.498863220214844, "learning_rate": 1.4213204198761518e-05, "loss": 2.3821, "step": 3422000 }, { "epoch": 7.164358441906421, "grad_norm": 16.84607696533203, "learning_rate": 1.4207959942271226e-05, "loss": 2.3806, "step": 3422500 }, { "epoch": 7.165405097632046, "grad_norm": 16.349733352661133, "learning_rate": 1.4202715685780935e-05, "loss": 2.3803, "step": 3423000 }, { "epoch": 7.166451753357672, "grad_norm": 18.60940933227539, "learning_rate": 1.419747142929064e-05, "loss": 2.3818, "step": 3423500 }, { "epoch": 7.167498409083297, "grad_norm": 19.963159561157227, "learning_rate": 1.4192227172800349e-05, "loss": 2.3905, "step": 3424000 }, { "epoch": 7.168545064808923, "grad_norm": 20.304338455200195, "learning_rate": 1.4186982916310057e-05, "loss": 2.3796, "step": 3424500 }, { "epoch": 7.169591720534548, "grad_norm": 18.504100799560547, "learning_rate": 1.4181738659819766e-05, "loss": 2.3788, "step": 3425000 }, { "epoch": 7.1706383762601735, "grad_norm": 17.05401611328125, "learning_rate": 1.4176494403329474e-05, "loss": 2.3727, "step": 3425500 }, { "epoch": 7.171685031985799, "grad_norm": 17.603145599365234, "learning_rate": 1.4171250146839183e-05, "loss": 2.3874, "step": 3426000 }, { "epoch": 7.172731687711424, "grad_norm": 13.457351684570312, "learning_rate": 1.416600589034889e-05, "loss": 2.376, "step": 3426500 }, { "epoch": 7.17377834343705, "grad_norm": 18.103342056274414, "learning_rate": 1.4160761633858598e-05, "loss": 2.359, "step": 3427000 }, { "epoch": 7.174824999162675, "grad_norm": 22.44732093811035, "learning_rate": 1.4155517377368308e-05, "loss": 2.3823, "step": 3427500 }, { "epoch": 7.175871654888301, "grad_norm": 16.69710350036621, "learning_rate": 1.4150273120878015e-05, "loss": 2.3653, "step": 3428000 }, { "epoch": 7.176918310613926, "grad_norm": 16.581018447875977, "learning_rate": 1.4145028864387725e-05, "loss": 2.3873, "step": 3428500 }, { "epoch": 7.177964966339552, "grad_norm": 20.159622192382812, "learning_rate": 1.4139784607897433e-05, "loss": 2.3714, "step": 3429000 }, { "epoch": 7.179011622065177, "grad_norm": 17.29271125793457, "learning_rate": 1.4134540351407139e-05, "loss": 2.3855, "step": 3429500 }, { "epoch": 7.180058277790803, "grad_norm": 15.4208345413208, "learning_rate": 1.4129296094916846e-05, "loss": 2.3846, "step": 3430000 }, { "epoch": 7.181104933516428, "grad_norm": 17.484790802001953, "learning_rate": 1.4124051838426556e-05, "loss": 2.3846, "step": 3430500 }, { "epoch": 7.1821515892420535, "grad_norm": 18.720977783203125, "learning_rate": 1.4118807581936263e-05, "loss": 2.4012, "step": 3431000 }, { "epoch": 7.183198244967679, "grad_norm": 23.973590850830078, "learning_rate": 1.4113563325445973e-05, "loss": 2.3652, "step": 3431500 }, { "epoch": 7.1842449006933045, "grad_norm": 20.37982749938965, "learning_rate": 1.410831906895568e-05, "loss": 2.4124, "step": 3432000 }, { "epoch": 7.18529155641893, "grad_norm": 18.254425048828125, "learning_rate": 1.4103074812465388e-05, "loss": 2.3845, "step": 3432500 }, { "epoch": 7.186338212144555, "grad_norm": 14.116348266601562, "learning_rate": 1.4097830555975097e-05, "loss": 2.3976, "step": 3433000 }, { "epoch": 7.187384867870181, "grad_norm": 19.86227035522461, "learning_rate": 1.4092586299484805e-05, "loss": 2.3741, "step": 3433500 }, { "epoch": 7.188431523595806, "grad_norm": 19.506839752197266, "learning_rate": 1.4087342042994515e-05, "loss": 2.3925, "step": 3434000 }, { "epoch": 7.189478179321432, "grad_norm": 16.10540199279785, "learning_rate": 1.4082097786504222e-05, "loss": 2.3903, "step": 3434500 }, { "epoch": 7.190524835047057, "grad_norm": 17.88289451599121, "learning_rate": 1.4076853530013928e-05, "loss": 2.3816, "step": 3435000 }, { "epoch": 7.1915714907726835, "grad_norm": 17.61490249633789, "learning_rate": 1.4071609273523636e-05, "loss": 2.3716, "step": 3435500 }, { "epoch": 7.192618146498309, "grad_norm": 18.324737548828125, "learning_rate": 1.4066365017033345e-05, "loss": 2.3806, "step": 3436000 }, { "epoch": 7.1936648022239345, "grad_norm": 18.38127326965332, "learning_rate": 1.4061120760543053e-05, "loss": 2.3837, "step": 3436500 }, { "epoch": 7.19471145794956, "grad_norm": 18.042448043823242, "learning_rate": 1.4055876504052762e-05, "loss": 2.3914, "step": 3437000 }, { "epoch": 7.195758113675185, "grad_norm": 15.466233253479004, "learning_rate": 1.405063224756247e-05, "loss": 2.3754, "step": 3437500 }, { "epoch": 7.196804769400811, "grad_norm": 18.894412994384766, "learning_rate": 1.4045387991072178e-05, "loss": 2.3831, "step": 3438000 }, { "epoch": 7.197851425126436, "grad_norm": 18.180158615112305, "learning_rate": 1.4040143734581887e-05, "loss": 2.3768, "step": 3438500 }, { "epoch": 7.198898080852062, "grad_norm": 20.035921096801758, "learning_rate": 1.4034899478091595e-05, "loss": 2.3818, "step": 3439000 }, { "epoch": 7.199944736577687, "grad_norm": 19.39762306213379, "learning_rate": 1.4029655221601304e-05, "loss": 2.386, "step": 3439500 }, { "epoch": 7.200991392303313, "grad_norm": 20.189558029174805, "learning_rate": 1.4024410965111012e-05, "loss": 2.374, "step": 3440000 }, { "epoch": 7.202038048028938, "grad_norm": 17.166223526000977, "learning_rate": 1.4019166708620718e-05, "loss": 2.379, "step": 3440500 }, { "epoch": 7.2030847037545636, "grad_norm": 17.687570571899414, "learning_rate": 1.4013922452130426e-05, "loss": 2.3731, "step": 3441000 }, { "epoch": 7.204131359480189, "grad_norm": 20.29813575744629, "learning_rate": 1.4008678195640135e-05, "loss": 2.3817, "step": 3441500 }, { "epoch": 7.2051780152058145, "grad_norm": 20.724008560180664, "learning_rate": 1.4003433939149843e-05, "loss": 2.3863, "step": 3442000 }, { "epoch": 7.20622467093144, "grad_norm": 18.974609375, "learning_rate": 1.3998189682659552e-05, "loss": 2.3617, "step": 3442500 }, { "epoch": 7.207271326657065, "grad_norm": 18.65865135192871, "learning_rate": 1.399294542616926e-05, "loss": 2.3682, "step": 3443000 }, { "epoch": 7.208317982382691, "grad_norm": 20.575031280517578, "learning_rate": 1.3987701169678969e-05, "loss": 2.3844, "step": 3443500 }, { "epoch": 7.209364638108316, "grad_norm": 19.88442611694336, "learning_rate": 1.3982456913188677e-05, "loss": 2.3716, "step": 3444000 }, { "epoch": 7.210411293833942, "grad_norm": 21.139087677001953, "learning_rate": 1.3977212656698385e-05, "loss": 2.4088, "step": 3444500 }, { "epoch": 7.211457949559567, "grad_norm": 18.223730087280273, "learning_rate": 1.3971968400208094e-05, "loss": 2.3803, "step": 3445000 }, { "epoch": 7.212504605285193, "grad_norm": 14.165038108825684, "learning_rate": 1.3966724143717802e-05, "loss": 2.3813, "step": 3445500 }, { "epoch": 7.213551261010818, "grad_norm": 17.999608993530273, "learning_rate": 1.3961479887227511e-05, "loss": 2.3852, "step": 3446000 }, { "epoch": 7.214597916736444, "grad_norm": 17.977458953857422, "learning_rate": 1.3956235630737215e-05, "loss": 2.3914, "step": 3446500 }, { "epoch": 7.215644572462069, "grad_norm": 17.359933853149414, "learning_rate": 1.3950991374246925e-05, "loss": 2.4084, "step": 3447000 }, { "epoch": 7.2166912281876945, "grad_norm": 16.48793601989746, "learning_rate": 1.3945747117756632e-05, "loss": 2.3752, "step": 3447500 }, { "epoch": 7.21773788391332, "grad_norm": 17.20168685913086, "learning_rate": 1.3940502861266342e-05, "loss": 2.384, "step": 3448000 }, { "epoch": 7.218784539638945, "grad_norm": 18.590103149414062, "learning_rate": 1.393525860477605e-05, "loss": 2.4, "step": 3448500 }, { "epoch": 7.219831195364571, "grad_norm": 16.307937622070312, "learning_rate": 1.3930014348285759e-05, "loss": 2.3646, "step": 3449000 }, { "epoch": 7.220877851090196, "grad_norm": 17.618410110473633, "learning_rate": 1.3924770091795467e-05, "loss": 2.3793, "step": 3449500 }, { "epoch": 7.221924506815822, "grad_norm": 22.031036376953125, "learning_rate": 1.3919525835305174e-05, "loss": 2.3633, "step": 3450000 }, { "epoch": 7.222971162541447, "grad_norm": 18.7186336517334, "learning_rate": 1.3914281578814884e-05, "loss": 2.398, "step": 3450500 }, { "epoch": 7.224017818267073, "grad_norm": 18.984447479248047, "learning_rate": 1.3909037322324591e-05, "loss": 2.3767, "step": 3451000 }, { "epoch": 7.225064473992698, "grad_norm": 17.198902130126953, "learning_rate": 1.39037930658343e-05, "loss": 2.3727, "step": 3451500 }, { "epoch": 7.226111129718324, "grad_norm": 18.4798641204834, "learning_rate": 1.3898548809344005e-05, "loss": 2.3858, "step": 3452000 }, { "epoch": 7.22715778544395, "grad_norm": 17.52542495727539, "learning_rate": 1.3893304552853714e-05, "loss": 2.3636, "step": 3452500 }, { "epoch": 7.228204441169575, "grad_norm": 15.22863483428955, "learning_rate": 1.3888060296363422e-05, "loss": 2.3784, "step": 3453000 }, { "epoch": 7.229251096895201, "grad_norm": 16.522790908813477, "learning_rate": 1.3882816039873131e-05, "loss": 2.3935, "step": 3453500 }, { "epoch": 7.230297752620826, "grad_norm": 17.33462142944336, "learning_rate": 1.3877571783382839e-05, "loss": 2.3801, "step": 3454000 }, { "epoch": 7.231344408346452, "grad_norm": 18.368471145629883, "learning_rate": 1.3872327526892549e-05, "loss": 2.3746, "step": 3454500 }, { "epoch": 7.232391064072077, "grad_norm": 20.086498260498047, "learning_rate": 1.3867083270402256e-05, "loss": 2.378, "step": 3455000 }, { "epoch": 7.233437719797703, "grad_norm": 19.88198471069336, "learning_rate": 1.3861839013911964e-05, "loss": 2.3736, "step": 3455500 }, { "epoch": 7.234484375523328, "grad_norm": 15.554521560668945, "learning_rate": 1.3856594757421673e-05, "loss": 2.3925, "step": 3456000 }, { "epoch": 7.235531031248954, "grad_norm": 19.46774673461914, "learning_rate": 1.3851350500931381e-05, "loss": 2.3779, "step": 3456500 }, { "epoch": 7.236577686974579, "grad_norm": 17.92117691040039, "learning_rate": 1.384610624444109e-05, "loss": 2.3892, "step": 3457000 }, { "epoch": 7.2376243427002045, "grad_norm": 15.169890403747559, "learning_rate": 1.3840861987950796e-05, "loss": 2.3789, "step": 3457500 }, { "epoch": 7.23867099842583, "grad_norm": 17.922649383544922, "learning_rate": 1.3835617731460504e-05, "loss": 2.3757, "step": 3458000 }, { "epoch": 7.239717654151455, "grad_norm": 16.97195053100586, "learning_rate": 1.3830373474970212e-05, "loss": 2.3799, "step": 3458500 }, { "epoch": 7.240764309877081, "grad_norm": 16.438358306884766, "learning_rate": 1.3825129218479921e-05, "loss": 2.3807, "step": 3459000 }, { "epoch": 7.241810965602706, "grad_norm": 16.979961395263672, "learning_rate": 1.3819884961989629e-05, "loss": 2.3816, "step": 3459500 }, { "epoch": 7.242857621328332, "grad_norm": 17.970434188842773, "learning_rate": 1.3814640705499338e-05, "loss": 2.402, "step": 3460000 }, { "epoch": 7.243904277053957, "grad_norm": 17.784198760986328, "learning_rate": 1.3809396449009046e-05, "loss": 2.3903, "step": 3460500 }, { "epoch": 7.244950932779583, "grad_norm": 25.528766632080078, "learning_rate": 1.3804152192518755e-05, "loss": 2.3662, "step": 3461000 }, { "epoch": 7.245997588505208, "grad_norm": 20.28416633605957, "learning_rate": 1.3798907936028463e-05, "loss": 2.3703, "step": 3461500 }, { "epoch": 7.247044244230834, "grad_norm": 16.456377029418945, "learning_rate": 1.379366367953817e-05, "loss": 2.3936, "step": 3462000 }, { "epoch": 7.248090899956459, "grad_norm": 17.00388526916504, "learning_rate": 1.378841942304788e-05, "loss": 2.3756, "step": 3462500 }, { "epoch": 7.2491375556820845, "grad_norm": 18.044286727905273, "learning_rate": 1.3783175166557588e-05, "loss": 2.3679, "step": 3463000 }, { "epoch": 7.25018421140771, "grad_norm": 17.45435333251953, "learning_rate": 1.3777930910067294e-05, "loss": 2.3923, "step": 3463500 }, { "epoch": 7.251230867133335, "grad_norm": 19.143299102783203, "learning_rate": 1.3772686653577001e-05, "loss": 2.3832, "step": 3464000 }, { "epoch": 7.252277522858961, "grad_norm": 20.35745620727539, "learning_rate": 1.376744239708671e-05, "loss": 2.3835, "step": 3464500 }, { "epoch": 7.253324178584586, "grad_norm": 18.699203491210938, "learning_rate": 1.3762198140596419e-05, "loss": 2.3835, "step": 3465000 }, { "epoch": 7.254370834310212, "grad_norm": 18.051782608032227, "learning_rate": 1.3756953884106128e-05, "loss": 2.3783, "step": 3465500 }, { "epoch": 7.255417490035837, "grad_norm": 17.69490623474121, "learning_rate": 1.3751709627615836e-05, "loss": 2.404, "step": 3466000 }, { "epoch": 7.256464145761463, "grad_norm": 19.22551727294922, "learning_rate": 1.3746465371125545e-05, "loss": 2.366, "step": 3466500 }, { "epoch": 7.257510801487088, "grad_norm": 14.914813041687012, "learning_rate": 1.3741221114635253e-05, "loss": 2.372, "step": 3467000 }, { "epoch": 7.258557457212714, "grad_norm": 17.306303024291992, "learning_rate": 1.373597685814496e-05, "loss": 2.3753, "step": 3467500 }, { "epoch": 7.259604112938339, "grad_norm": 17.43406105041504, "learning_rate": 1.373073260165467e-05, "loss": 2.3778, "step": 3468000 }, { "epoch": 7.2606507686639645, "grad_norm": 17.25432586669922, "learning_rate": 1.3725488345164377e-05, "loss": 2.3748, "step": 3468500 }, { "epoch": 7.26169742438959, "grad_norm": 16.936433792114258, "learning_rate": 1.3720244088674083e-05, "loss": 2.3722, "step": 3469000 }, { "epoch": 7.262744080115215, "grad_norm": 18.164871215820312, "learning_rate": 1.3714999832183791e-05, "loss": 2.3885, "step": 3469500 }, { "epoch": 7.263790735840841, "grad_norm": 14.633703231811523, "learning_rate": 1.37097555756935e-05, "loss": 2.3856, "step": 3470000 }, { "epoch": 7.264837391566467, "grad_norm": 18.26593589782715, "learning_rate": 1.3704511319203208e-05, "loss": 2.3685, "step": 3470500 }, { "epoch": 7.265884047292093, "grad_norm": 18.50886344909668, "learning_rate": 1.3699267062712918e-05, "loss": 2.3639, "step": 3471000 }, { "epoch": 7.266930703017718, "grad_norm": 16.706035614013672, "learning_rate": 1.3694022806222625e-05, "loss": 2.3742, "step": 3471500 }, { "epoch": 7.267977358743344, "grad_norm": 22.849077224731445, "learning_rate": 1.3688778549732335e-05, "loss": 2.3789, "step": 3472000 }, { "epoch": 7.269024014468969, "grad_norm": 20.341907501220703, "learning_rate": 1.3683534293242042e-05, "loss": 2.3798, "step": 3472500 }, { "epoch": 7.2700706701945945, "grad_norm": 22.54947280883789, "learning_rate": 1.367829003675175e-05, "loss": 2.3881, "step": 3473000 }, { "epoch": 7.27111732592022, "grad_norm": 17.702354431152344, "learning_rate": 1.367304578026146e-05, "loss": 2.3869, "step": 3473500 }, { "epoch": 7.272163981645845, "grad_norm": 16.74564552307129, "learning_rate": 1.3667801523771167e-05, "loss": 2.3838, "step": 3474000 }, { "epoch": 7.273210637371471, "grad_norm": 16.752174377441406, "learning_rate": 1.3662557267280873e-05, "loss": 2.3883, "step": 3474500 }, { "epoch": 7.274257293097096, "grad_norm": 16.977712631225586, "learning_rate": 1.3657313010790582e-05, "loss": 2.3591, "step": 3475000 }, { "epoch": 7.275303948822722, "grad_norm": 19.515382766723633, "learning_rate": 1.365206875430029e-05, "loss": 2.3856, "step": 3475500 }, { "epoch": 7.276350604548347, "grad_norm": 22.32798194885254, "learning_rate": 1.3646824497809998e-05, "loss": 2.3789, "step": 3476000 }, { "epoch": 7.277397260273973, "grad_norm": 21.058027267456055, "learning_rate": 1.3641580241319707e-05, "loss": 2.3638, "step": 3476500 }, { "epoch": 7.278443915999598, "grad_norm": 16.437631607055664, "learning_rate": 1.3636335984829415e-05, "loss": 2.3695, "step": 3477000 }, { "epoch": 7.279490571725224, "grad_norm": 15.332403182983398, "learning_rate": 1.3631091728339124e-05, "loss": 2.3725, "step": 3477500 }, { "epoch": 7.280537227450849, "grad_norm": 16.405139923095703, "learning_rate": 1.3625847471848832e-05, "loss": 2.3842, "step": 3478000 }, { "epoch": 7.2815838831764745, "grad_norm": 17.668472290039062, "learning_rate": 1.362060321535854e-05, "loss": 2.3994, "step": 3478500 }, { "epoch": 7.2826305389021, "grad_norm": 18.77671241760254, "learning_rate": 1.3615358958868249e-05, "loss": 2.3932, "step": 3479000 }, { "epoch": 7.2836771946277254, "grad_norm": 19.532451629638672, "learning_rate": 1.3610114702377957e-05, "loss": 2.373, "step": 3479500 }, { "epoch": 7.284723850353351, "grad_norm": 18.845226287841797, "learning_rate": 1.3604870445887666e-05, "loss": 2.3702, "step": 3480000 }, { "epoch": 7.285770506078976, "grad_norm": 18.893705368041992, "learning_rate": 1.3599626189397372e-05, "loss": 2.3791, "step": 3480500 }, { "epoch": 7.286817161804602, "grad_norm": 19.556137084960938, "learning_rate": 1.359438193290708e-05, "loss": 2.387, "step": 3481000 }, { "epoch": 7.287863817530227, "grad_norm": 20.357982635498047, "learning_rate": 1.3589137676416788e-05, "loss": 2.3764, "step": 3481500 }, { "epoch": 7.288910473255853, "grad_norm": 18.3829288482666, "learning_rate": 1.3583893419926497e-05, "loss": 2.3761, "step": 3482000 }, { "epoch": 7.289957128981478, "grad_norm": 18.718780517578125, "learning_rate": 1.3578649163436205e-05, "loss": 2.3611, "step": 3482500 }, { "epoch": 7.291003784707104, "grad_norm": 17.074291229248047, "learning_rate": 1.3573404906945914e-05, "loss": 2.3779, "step": 3483000 }, { "epoch": 7.292050440432729, "grad_norm": 17.024784088134766, "learning_rate": 1.3568160650455622e-05, "loss": 2.3807, "step": 3483500 }, { "epoch": 7.2930970961583546, "grad_norm": 19.096649169921875, "learning_rate": 1.3562916393965331e-05, "loss": 2.3674, "step": 3484000 }, { "epoch": 7.29414375188398, "grad_norm": 15.581716537475586, "learning_rate": 1.3557672137475039e-05, "loss": 2.381, "step": 3484500 }, { "epoch": 7.2951904076096055, "grad_norm": 16.5919132232666, "learning_rate": 1.3552427880984746e-05, "loss": 2.3826, "step": 3485000 }, { "epoch": 7.296237063335231, "grad_norm": 16.830644607543945, "learning_rate": 1.3547183624494456e-05, "loss": 2.381, "step": 3485500 }, { "epoch": 7.297283719060856, "grad_norm": 17.565340042114258, "learning_rate": 1.3541939368004162e-05, "loss": 2.3626, "step": 3486000 }, { "epoch": 7.298330374786482, "grad_norm": 15.150001525878906, "learning_rate": 1.353669511151387e-05, "loss": 2.3868, "step": 3486500 }, { "epoch": 7.299377030512108, "grad_norm": 20.103574752807617, "learning_rate": 1.3531450855023577e-05, "loss": 2.3756, "step": 3487000 }, { "epoch": 7.300423686237734, "grad_norm": 18.830411911010742, "learning_rate": 1.3526206598533287e-05, "loss": 2.3792, "step": 3487500 }, { "epoch": 7.301470341963359, "grad_norm": 16.209253311157227, "learning_rate": 1.3520962342042994e-05, "loss": 2.3797, "step": 3488000 }, { "epoch": 7.3025169976889845, "grad_norm": 14.904646873474121, "learning_rate": 1.3515718085552704e-05, "loss": 2.3762, "step": 3488500 }, { "epoch": 7.30356365341461, "grad_norm": 16.511104583740234, "learning_rate": 1.3510473829062411e-05, "loss": 2.3756, "step": 3489000 }, { "epoch": 7.3046103091402355, "grad_norm": 16.079652786254883, "learning_rate": 1.350522957257212e-05, "loss": 2.3604, "step": 3489500 }, { "epoch": 7.305656964865861, "grad_norm": 17.167346954345703, "learning_rate": 1.3499985316081828e-05, "loss": 2.3903, "step": 3490000 }, { "epoch": 7.306703620591486, "grad_norm": 19.670536041259766, "learning_rate": 1.3494741059591536e-05, "loss": 2.3912, "step": 3490500 }, { "epoch": 7.307750276317112, "grad_norm": 17.933195114135742, "learning_rate": 1.3489496803101246e-05, "loss": 2.3746, "step": 3491000 }, { "epoch": 7.308796932042737, "grad_norm": 17.1979923248291, "learning_rate": 1.3484252546610953e-05, "loss": 2.3715, "step": 3491500 }, { "epoch": 7.309843587768363, "grad_norm": 16.457887649536133, "learning_rate": 1.347900829012066e-05, "loss": 2.3825, "step": 3492000 }, { "epoch": 7.310890243493988, "grad_norm": 15.433152198791504, "learning_rate": 1.3473764033630367e-05, "loss": 2.4029, "step": 3492500 }, { "epoch": 7.311936899219614, "grad_norm": 19.588403701782227, "learning_rate": 1.3468519777140076e-05, "loss": 2.3586, "step": 3493000 }, { "epoch": 7.312983554945239, "grad_norm": 17.45609474182129, "learning_rate": 1.3463275520649784e-05, "loss": 2.3806, "step": 3493500 }, { "epoch": 7.314030210670865, "grad_norm": 14.873995780944824, "learning_rate": 1.3458031264159493e-05, "loss": 2.388, "step": 3494000 }, { "epoch": 7.31507686639649, "grad_norm": 17.628376007080078, "learning_rate": 1.3452787007669201e-05, "loss": 2.3831, "step": 3494500 }, { "epoch": 7.3161235221221155, "grad_norm": 18.4437255859375, "learning_rate": 1.344754275117891e-05, "loss": 2.3821, "step": 3495000 }, { "epoch": 7.317170177847741, "grad_norm": 14.227787971496582, "learning_rate": 1.3442298494688618e-05, "loss": 2.3781, "step": 3495500 }, { "epoch": 7.318216833573366, "grad_norm": 18.348297119140625, "learning_rate": 1.3437054238198326e-05, "loss": 2.3439, "step": 3496000 }, { "epoch": 7.319263489298992, "grad_norm": 21.76491355895996, "learning_rate": 1.3431809981708035e-05, "loss": 2.3688, "step": 3496500 }, { "epoch": 7.320310145024617, "grad_norm": 14.813722610473633, "learning_rate": 1.3426565725217743e-05, "loss": 2.3755, "step": 3497000 }, { "epoch": 7.321356800750243, "grad_norm": 19.352645874023438, "learning_rate": 1.3421321468727449e-05, "loss": 2.3593, "step": 3497500 }, { "epoch": 7.322403456475868, "grad_norm": 20.554040908813477, "learning_rate": 1.3416077212237158e-05, "loss": 2.3789, "step": 3498000 }, { "epoch": 7.323450112201494, "grad_norm": 18.586103439331055, "learning_rate": 1.3410832955746866e-05, "loss": 2.3768, "step": 3498500 }, { "epoch": 7.324496767927119, "grad_norm": 15.043233871459961, "learning_rate": 1.3405588699256574e-05, "loss": 2.3615, "step": 3499000 }, { "epoch": 7.325543423652745, "grad_norm": 17.494476318359375, "learning_rate": 1.3400344442766283e-05, "loss": 2.3705, "step": 3499500 }, { "epoch": 7.32659007937837, "grad_norm": 16.706954956054688, "learning_rate": 1.339510018627599e-05, "loss": 2.3824, "step": 3500000 }, { "epoch": 7.3276367351039955, "grad_norm": 15.957395553588867, "learning_rate": 1.33898559297857e-05, "loss": 2.3896, "step": 3500500 }, { "epoch": 7.328683390829621, "grad_norm": 17.683700561523438, "learning_rate": 1.3384611673295408e-05, "loss": 2.3862, "step": 3501000 }, { "epoch": 7.329730046555246, "grad_norm": 21.442228317260742, "learning_rate": 1.3379367416805117e-05, "loss": 2.3797, "step": 3501500 }, { "epoch": 7.330776702280872, "grad_norm": 19.653400421142578, "learning_rate": 1.3374123160314825e-05, "loss": 2.3474, "step": 3502000 }, { "epoch": 7.331823358006497, "grad_norm": 20.787792205810547, "learning_rate": 1.3368878903824533e-05, "loss": 2.3582, "step": 3502500 }, { "epoch": 7.332870013732123, "grad_norm": 19.14055633544922, "learning_rate": 1.3363634647334239e-05, "loss": 2.3644, "step": 3503000 }, { "epoch": 7.333916669457748, "grad_norm": 19.036117553710938, "learning_rate": 1.3358390390843948e-05, "loss": 2.3728, "step": 3503500 }, { "epoch": 7.334963325183374, "grad_norm": 17.70400047302246, "learning_rate": 1.3353146134353656e-05, "loss": 2.3582, "step": 3504000 }, { "epoch": 7.336009980908999, "grad_norm": 20.614776611328125, "learning_rate": 1.3347901877863363e-05, "loss": 2.3612, "step": 3504500 }, { "epoch": 7.337056636634625, "grad_norm": 19.500688552856445, "learning_rate": 1.3342657621373073e-05, "loss": 2.3711, "step": 3505000 }, { "epoch": 7.338103292360251, "grad_norm": 13.027121543884277, "learning_rate": 1.333741336488278e-05, "loss": 2.3906, "step": 3505500 }, { "epoch": 7.339149948085876, "grad_norm": 18.308683395385742, "learning_rate": 1.333216910839249e-05, "loss": 2.3538, "step": 3506000 }, { "epoch": 7.340196603811502, "grad_norm": 19.23273277282715, "learning_rate": 1.3326924851902198e-05, "loss": 2.377, "step": 3506500 }, { "epoch": 7.341243259537127, "grad_norm": 18.36081314086914, "learning_rate": 1.3321680595411907e-05, "loss": 2.3887, "step": 3507000 }, { "epoch": 7.342289915262753, "grad_norm": 17.249900817871094, "learning_rate": 1.3316436338921615e-05, "loss": 2.3678, "step": 3507500 }, { "epoch": 7.343336570988378, "grad_norm": 17.25297737121582, "learning_rate": 1.3311192082431322e-05, "loss": 2.382, "step": 3508000 }, { "epoch": 7.344383226714004, "grad_norm": 19.365009307861328, "learning_rate": 1.3305947825941032e-05, "loss": 2.3765, "step": 3508500 }, { "epoch": 7.345429882439629, "grad_norm": 16.37972068786621, "learning_rate": 1.3300703569450738e-05, "loss": 2.3646, "step": 3509000 }, { "epoch": 7.346476538165255, "grad_norm": 15.454504013061523, "learning_rate": 1.3295459312960445e-05, "loss": 2.3662, "step": 3509500 }, { "epoch": 7.34752319389088, "grad_norm": 18.601943969726562, "learning_rate": 1.3290215056470153e-05, "loss": 2.3826, "step": 3510000 }, { "epoch": 7.3485698496165055, "grad_norm": 15.3634033203125, "learning_rate": 1.3284970799979862e-05, "loss": 2.3681, "step": 3510500 }, { "epoch": 7.349616505342131, "grad_norm": 17.81190299987793, "learning_rate": 1.327972654348957e-05, "loss": 2.3828, "step": 3511000 }, { "epoch": 7.350663161067756, "grad_norm": 16.946853637695312, "learning_rate": 1.327448228699928e-05, "loss": 2.3623, "step": 3511500 }, { "epoch": 7.351709816793382, "grad_norm": 19.186904907226562, "learning_rate": 1.3269238030508987e-05, "loss": 2.3857, "step": 3512000 }, { "epoch": 7.352756472519007, "grad_norm": 15.008978843688965, "learning_rate": 1.3263993774018697e-05, "loss": 2.3618, "step": 3512500 }, { "epoch": 7.353803128244633, "grad_norm": 20.09295654296875, "learning_rate": 1.3258749517528404e-05, "loss": 2.3615, "step": 3513000 }, { "epoch": 7.354849783970258, "grad_norm": 18.438364028930664, "learning_rate": 1.3253505261038112e-05, "loss": 2.3909, "step": 3513500 }, { "epoch": 7.355896439695884, "grad_norm": 18.403173446655273, "learning_rate": 1.3248261004547821e-05, "loss": 2.3833, "step": 3514000 }, { "epoch": 7.356943095421509, "grad_norm": 26.272130966186523, "learning_rate": 1.3243016748057527e-05, "loss": 2.3796, "step": 3514500 }, { "epoch": 7.357989751147135, "grad_norm": 21.533632278442383, "learning_rate": 1.3237772491567235e-05, "loss": 2.369, "step": 3515000 }, { "epoch": 7.35903640687276, "grad_norm": 18.233627319335938, "learning_rate": 1.3232528235076944e-05, "loss": 2.3823, "step": 3515500 }, { "epoch": 7.3600830625983855, "grad_norm": 19.524412155151367, "learning_rate": 1.3227283978586652e-05, "loss": 2.3649, "step": 3516000 }, { "epoch": 7.361129718324011, "grad_norm": 30.139442443847656, "learning_rate": 1.322203972209636e-05, "loss": 2.378, "step": 3516500 }, { "epoch": 7.362176374049636, "grad_norm": 17.160673141479492, "learning_rate": 1.321679546560607e-05, "loss": 2.3668, "step": 3517000 }, { "epoch": 7.363223029775262, "grad_norm": 15.50678539276123, "learning_rate": 1.3211551209115777e-05, "loss": 2.3781, "step": 3517500 }, { "epoch": 7.364269685500887, "grad_norm": 16.91550636291504, "learning_rate": 1.3206306952625486e-05, "loss": 2.3544, "step": 3518000 }, { "epoch": 7.365316341226513, "grad_norm": 16.353561401367188, "learning_rate": 1.3201062696135194e-05, "loss": 2.3538, "step": 3518500 }, { "epoch": 7.366362996952138, "grad_norm": 16.31477165222168, "learning_rate": 1.3195818439644902e-05, "loss": 2.366, "step": 3519000 }, { "epoch": 7.367409652677764, "grad_norm": 21.375059127807617, "learning_rate": 1.3190574183154611e-05, "loss": 2.3677, "step": 3519500 }, { "epoch": 7.368456308403389, "grad_norm": 18.559585571289062, "learning_rate": 1.3185329926664317e-05, "loss": 2.3693, "step": 3520000 }, { "epoch": 7.369502964129015, "grad_norm": 17.31613540649414, "learning_rate": 1.3180085670174025e-05, "loss": 2.3714, "step": 3520500 }, { "epoch": 7.37054961985464, "grad_norm": 22.6802921295166, "learning_rate": 1.3174841413683734e-05, "loss": 2.346, "step": 3521000 }, { "epoch": 7.3715962755802655, "grad_norm": 16.32378387451172, "learning_rate": 1.3169597157193442e-05, "loss": 2.3868, "step": 3521500 }, { "epoch": 7.372642931305892, "grad_norm": 19.827877044677734, "learning_rate": 1.316435290070315e-05, "loss": 2.3645, "step": 3522000 }, { "epoch": 7.373689587031517, "grad_norm": 17.122800827026367, "learning_rate": 1.3159108644212859e-05, "loss": 2.3671, "step": 3522500 }, { "epoch": 7.374736242757143, "grad_norm": 15.903850555419922, "learning_rate": 1.3153864387722567e-05, "loss": 2.3684, "step": 3523000 }, { "epoch": 7.375782898482768, "grad_norm": 16.778366088867188, "learning_rate": 1.3148620131232276e-05, "loss": 2.3684, "step": 3523500 }, { "epoch": 7.376829554208394, "grad_norm": 22.30548095703125, "learning_rate": 1.3143375874741984e-05, "loss": 2.3551, "step": 3524000 }, { "epoch": 7.377876209934019, "grad_norm": 19.376998901367188, "learning_rate": 1.3138131618251693e-05, "loss": 2.3688, "step": 3524500 }, { "epoch": 7.378922865659645, "grad_norm": 21.060102462768555, "learning_rate": 1.31328873617614e-05, "loss": 2.3853, "step": 3525000 }, { "epoch": 7.37996952138527, "grad_norm": 16.948348999023438, "learning_rate": 1.3127643105271108e-05, "loss": 2.38, "step": 3525500 }, { "epoch": 7.3810161771108955, "grad_norm": 20.909549713134766, "learning_rate": 1.3122398848780814e-05, "loss": 2.3861, "step": 3526000 }, { "epoch": 7.382062832836521, "grad_norm": 17.628116607666016, "learning_rate": 1.3117154592290524e-05, "loss": 2.3786, "step": 3526500 }, { "epoch": 7.383109488562146, "grad_norm": 15.83138370513916, "learning_rate": 1.3111910335800232e-05, "loss": 2.378, "step": 3527000 }, { "epoch": 7.384156144287772, "grad_norm": 22.097591400146484, "learning_rate": 1.310666607930994e-05, "loss": 2.363, "step": 3527500 }, { "epoch": 7.385202800013397, "grad_norm": 20.106815338134766, "learning_rate": 1.3101421822819649e-05, "loss": 2.3767, "step": 3528000 }, { "epoch": 7.386249455739023, "grad_norm": 16.397663116455078, "learning_rate": 1.3096177566329356e-05, "loss": 2.3511, "step": 3528500 }, { "epoch": 7.387296111464648, "grad_norm": 16.920557022094727, "learning_rate": 1.3090933309839066e-05, "loss": 2.3638, "step": 3529000 }, { "epoch": 7.388342767190274, "grad_norm": 16.737510681152344, "learning_rate": 1.3085689053348773e-05, "loss": 2.3538, "step": 3529500 }, { "epoch": 7.389389422915899, "grad_norm": 19.72517967224121, "learning_rate": 1.3080444796858483e-05, "loss": 2.3739, "step": 3530000 }, { "epoch": 7.390436078641525, "grad_norm": 15.95637321472168, "learning_rate": 1.307520054036819e-05, "loss": 2.3933, "step": 3530500 }, { "epoch": 7.39148273436715, "grad_norm": 16.674161911010742, "learning_rate": 1.3069956283877898e-05, "loss": 2.3748, "step": 3531000 }, { "epoch": 7.3925293900927755, "grad_norm": 19.99057960510254, "learning_rate": 1.3064712027387604e-05, "loss": 2.3735, "step": 3531500 }, { "epoch": 7.393576045818401, "grad_norm": 18.377361297607422, "learning_rate": 1.3059467770897314e-05, "loss": 2.367, "step": 3532000 }, { "epoch": 7.3946227015440265, "grad_norm": 18.335983276367188, "learning_rate": 1.3054223514407021e-05, "loss": 2.3799, "step": 3532500 }, { "epoch": 7.395669357269652, "grad_norm": 17.225664138793945, "learning_rate": 1.3048979257916729e-05, "loss": 2.3753, "step": 3533000 }, { "epoch": 7.396716012995277, "grad_norm": 16.507308959960938, "learning_rate": 1.3043735001426438e-05, "loss": 2.3661, "step": 3533500 }, { "epoch": 7.397762668720903, "grad_norm": 19.299785614013672, "learning_rate": 1.3038490744936146e-05, "loss": 2.3769, "step": 3534000 }, { "epoch": 7.398809324446528, "grad_norm": 15.64687442779541, "learning_rate": 1.3033246488445855e-05, "loss": 2.3748, "step": 3534500 }, { "epoch": 7.399855980172154, "grad_norm": 21.82071876525879, "learning_rate": 1.3028002231955563e-05, "loss": 2.3663, "step": 3535000 }, { "epoch": 7.400902635897779, "grad_norm": 17.88788414001465, "learning_rate": 1.3022757975465272e-05, "loss": 2.3651, "step": 3535500 }, { "epoch": 7.401949291623405, "grad_norm": 18.57623863220215, "learning_rate": 1.301751371897498e-05, "loss": 2.3981, "step": 3536000 }, { "epoch": 7.40299594734903, "grad_norm": 17.264381408691406, "learning_rate": 1.3012269462484688e-05, "loss": 2.4001, "step": 3536500 }, { "epoch": 7.404042603074656, "grad_norm": 17.2443790435791, "learning_rate": 1.3007025205994394e-05, "loss": 2.374, "step": 3537000 }, { "epoch": 7.405089258800281, "grad_norm": 19.269193649291992, "learning_rate": 1.3001780949504103e-05, "loss": 2.3859, "step": 3537500 }, { "epoch": 7.4061359145259065, "grad_norm": 16.647506713867188, "learning_rate": 1.2996536693013811e-05, "loss": 2.3564, "step": 3538000 }, { "epoch": 7.407182570251532, "grad_norm": 17.73529052734375, "learning_rate": 1.299129243652352e-05, "loss": 2.3905, "step": 3538500 }, { "epoch": 7.408229225977157, "grad_norm": 17.64923095703125, "learning_rate": 1.2986048180033228e-05, "loss": 2.3694, "step": 3539000 }, { "epoch": 7.409275881702783, "grad_norm": 19.414398193359375, "learning_rate": 1.2980803923542936e-05, "loss": 2.3636, "step": 3539500 }, { "epoch": 7.410322537428408, "grad_norm": 16.949237823486328, "learning_rate": 1.2975559667052645e-05, "loss": 2.382, "step": 3540000 }, { "epoch": 7.411369193154035, "grad_norm": 16.68345832824707, "learning_rate": 1.2970315410562353e-05, "loss": 2.3729, "step": 3540500 }, { "epoch": 7.41241584887966, "grad_norm": 20.30150604248047, "learning_rate": 1.2965071154072062e-05, "loss": 2.3654, "step": 3541000 }, { "epoch": 7.4134625046052856, "grad_norm": 16.591962814331055, "learning_rate": 1.295982689758177e-05, "loss": 2.3678, "step": 3541500 }, { "epoch": 7.414509160330911, "grad_norm": 18.633663177490234, "learning_rate": 1.295458264109148e-05, "loss": 2.3575, "step": 3542000 }, { "epoch": 7.4155558160565365, "grad_norm": 22.5517520904541, "learning_rate": 1.2949338384601187e-05, "loss": 2.3876, "step": 3542500 }, { "epoch": 7.416602471782162, "grad_norm": 19.602182388305664, "learning_rate": 1.2944094128110893e-05, "loss": 2.3635, "step": 3543000 }, { "epoch": 7.417649127507787, "grad_norm": 19.931636810302734, "learning_rate": 1.29388498716206e-05, "loss": 2.3768, "step": 3543500 }, { "epoch": 7.418695783233413, "grad_norm": 18.463232040405273, "learning_rate": 1.293360561513031e-05, "loss": 2.3818, "step": 3544000 }, { "epoch": 7.419742438959038, "grad_norm": 18.61618995666504, "learning_rate": 1.2928361358640018e-05, "loss": 2.3437, "step": 3544500 }, { "epoch": 7.420789094684664, "grad_norm": 17.8369197845459, "learning_rate": 1.2923117102149725e-05, "loss": 2.3768, "step": 3545000 }, { "epoch": 7.421835750410289, "grad_norm": 19.50063133239746, "learning_rate": 1.2917872845659435e-05, "loss": 2.3877, "step": 3545500 }, { "epoch": 7.422882406135915, "grad_norm": 17.00188636779785, "learning_rate": 1.2912628589169142e-05, "loss": 2.3552, "step": 3546000 }, { "epoch": 7.42392906186154, "grad_norm": 16.9198055267334, "learning_rate": 1.2907384332678852e-05, "loss": 2.3701, "step": 3546500 }, { "epoch": 7.424975717587166, "grad_norm": 17.477481842041016, "learning_rate": 1.290214007618856e-05, "loss": 2.3375, "step": 3547000 }, { "epoch": 7.426022373312791, "grad_norm": 17.562456130981445, "learning_rate": 1.2896895819698269e-05, "loss": 2.3672, "step": 3547500 }, { "epoch": 7.4270690290384165, "grad_norm": 17.641857147216797, "learning_rate": 1.2891651563207977e-05, "loss": 2.3907, "step": 3548000 }, { "epoch": 7.428115684764042, "grad_norm": 15.63657283782959, "learning_rate": 1.2886407306717683e-05, "loss": 2.3812, "step": 3548500 }, { "epoch": 7.429162340489667, "grad_norm": 21.651147842407227, "learning_rate": 1.288116305022739e-05, "loss": 2.3483, "step": 3549000 }, { "epoch": 7.430208996215293, "grad_norm": 18.99908447265625, "learning_rate": 1.28759187937371e-05, "loss": 2.3695, "step": 3549500 }, { "epoch": 7.431255651940918, "grad_norm": 14.614758491516113, "learning_rate": 1.2870674537246807e-05, "loss": 2.3634, "step": 3550000 }, { "epoch": 7.432302307666544, "grad_norm": 17.141494750976562, "learning_rate": 1.2865430280756515e-05, "loss": 2.3732, "step": 3550500 }, { "epoch": 7.433348963392169, "grad_norm": 16.93023681640625, "learning_rate": 1.2860186024266224e-05, "loss": 2.3816, "step": 3551000 }, { "epoch": 7.434395619117795, "grad_norm": 19.005653381347656, "learning_rate": 1.2854941767775932e-05, "loss": 2.3764, "step": 3551500 }, { "epoch": 7.43544227484342, "grad_norm": 17.888469696044922, "learning_rate": 1.2849697511285641e-05, "loss": 2.3518, "step": 3552000 }, { "epoch": 7.436488930569046, "grad_norm": 18.05312728881836, "learning_rate": 1.284445325479535e-05, "loss": 2.3808, "step": 3552500 }, { "epoch": 7.437535586294671, "grad_norm": 16.424402236938477, "learning_rate": 1.2839208998305059e-05, "loss": 2.3774, "step": 3553000 }, { "epoch": 7.4385822420202965, "grad_norm": 19.23243522644043, "learning_rate": 1.2833964741814766e-05, "loss": 2.3613, "step": 3553500 }, { "epoch": 7.439628897745922, "grad_norm": 18.802967071533203, "learning_rate": 1.2828720485324472e-05, "loss": 2.3867, "step": 3554000 }, { "epoch": 7.440675553471547, "grad_norm": 17.629985809326172, "learning_rate": 1.282347622883418e-05, "loss": 2.3885, "step": 3554500 }, { "epoch": 7.441722209197173, "grad_norm": 18.919937133789062, "learning_rate": 1.281823197234389e-05, "loss": 2.3828, "step": 3555000 }, { "epoch": 7.442768864922798, "grad_norm": 18.489850997924805, "learning_rate": 1.2812987715853597e-05, "loss": 2.3784, "step": 3555500 }, { "epoch": 7.443815520648424, "grad_norm": 20.567609786987305, "learning_rate": 1.2807743459363305e-05, "loss": 2.3637, "step": 3556000 }, { "epoch": 7.444862176374049, "grad_norm": 20.817058563232422, "learning_rate": 1.2802499202873014e-05, "loss": 2.3545, "step": 3556500 }, { "epoch": 7.445908832099676, "grad_norm": 17.731416702270508, "learning_rate": 1.2797254946382722e-05, "loss": 2.3892, "step": 3557000 }, { "epoch": 7.446955487825301, "grad_norm": 16.061843872070312, "learning_rate": 1.2792010689892431e-05, "loss": 2.3622, "step": 3557500 }, { "epoch": 7.4480021435509265, "grad_norm": 15.614197731018066, "learning_rate": 1.2786766433402139e-05, "loss": 2.3818, "step": 3558000 }, { "epoch": 7.449048799276552, "grad_norm": 18.474166870117188, "learning_rate": 1.2781522176911848e-05, "loss": 2.3824, "step": 3558500 }, { "epoch": 7.450095455002177, "grad_norm": 15.643153190612793, "learning_rate": 1.2776277920421556e-05, "loss": 2.3745, "step": 3559000 }, { "epoch": 7.451142110727803, "grad_norm": 20.82729721069336, "learning_rate": 1.2771033663931264e-05, "loss": 2.3471, "step": 3559500 }, { "epoch": 7.452188766453428, "grad_norm": 16.913122177124023, "learning_rate": 1.276578940744097e-05, "loss": 2.3763, "step": 3560000 }, { "epoch": 7.453235422179054, "grad_norm": 15.815290451049805, "learning_rate": 1.2760545150950679e-05, "loss": 2.3889, "step": 3560500 }, { "epoch": 7.454282077904679, "grad_norm": 17.0493221282959, "learning_rate": 1.2755300894460387e-05, "loss": 2.3519, "step": 3561000 }, { "epoch": 7.455328733630305, "grad_norm": 16.15826988220215, "learning_rate": 1.2750056637970096e-05, "loss": 2.3637, "step": 3561500 }, { "epoch": 7.45637538935593, "grad_norm": 16.021862030029297, "learning_rate": 1.2744812381479804e-05, "loss": 2.3672, "step": 3562000 }, { "epoch": 7.457422045081556, "grad_norm": 18.90130615234375, "learning_rate": 1.2739568124989511e-05, "loss": 2.3606, "step": 3562500 }, { "epoch": 7.458468700807181, "grad_norm": 19.653179168701172, "learning_rate": 1.2734323868499221e-05, "loss": 2.3619, "step": 3563000 }, { "epoch": 7.4595153565328065, "grad_norm": 16.364742279052734, "learning_rate": 1.2729079612008929e-05, "loss": 2.3666, "step": 3563500 }, { "epoch": 7.460562012258432, "grad_norm": 17.7165470123291, "learning_rate": 1.2723835355518638e-05, "loss": 2.363, "step": 3564000 }, { "epoch": 7.461608667984057, "grad_norm": 18.70760726928711, "learning_rate": 1.2718591099028346e-05, "loss": 2.3697, "step": 3564500 }, { "epoch": 7.462655323709683, "grad_norm": 18.036090850830078, "learning_rate": 1.2713346842538055e-05, "loss": 2.3764, "step": 3565000 }, { "epoch": 7.463701979435308, "grad_norm": 17.787322998046875, "learning_rate": 1.270810258604776e-05, "loss": 2.3418, "step": 3565500 }, { "epoch": 7.464748635160934, "grad_norm": 18.645832061767578, "learning_rate": 1.2702858329557469e-05, "loss": 2.3624, "step": 3566000 }, { "epoch": 7.465795290886559, "grad_norm": 17.241037368774414, "learning_rate": 1.2697614073067176e-05, "loss": 2.3748, "step": 3566500 }, { "epoch": 7.466841946612185, "grad_norm": 18.31142234802246, "learning_rate": 1.2692369816576886e-05, "loss": 2.364, "step": 3567000 }, { "epoch": 7.46788860233781, "grad_norm": 21.1509952545166, "learning_rate": 1.2687125560086593e-05, "loss": 2.3632, "step": 3567500 }, { "epoch": 7.468935258063436, "grad_norm": 19.139062881469727, "learning_rate": 1.2681881303596301e-05, "loss": 2.3484, "step": 3568000 }, { "epoch": 7.469981913789061, "grad_norm": 17.202024459838867, "learning_rate": 1.267663704710601e-05, "loss": 2.3879, "step": 3568500 }, { "epoch": 7.4710285695146865, "grad_norm": 15.724287033081055, "learning_rate": 1.2671392790615718e-05, "loss": 2.3681, "step": 3569000 }, { "epoch": 7.472075225240312, "grad_norm": 20.57360076904297, "learning_rate": 1.2666148534125428e-05, "loss": 2.3598, "step": 3569500 }, { "epoch": 7.473121880965937, "grad_norm": 21.914960861206055, "learning_rate": 1.2660904277635135e-05, "loss": 2.3573, "step": 3570000 }, { "epoch": 7.474168536691563, "grad_norm": 18.872285842895508, "learning_rate": 1.2655660021144845e-05, "loss": 2.3811, "step": 3570500 }, { "epoch": 7.475215192417188, "grad_norm": 20.132343292236328, "learning_rate": 1.2650415764654549e-05, "loss": 2.3742, "step": 3571000 }, { "epoch": 7.476261848142814, "grad_norm": 18.111797332763672, "learning_rate": 1.2645171508164258e-05, "loss": 2.3662, "step": 3571500 }, { "epoch": 7.477308503868439, "grad_norm": 16.286914825439453, "learning_rate": 1.2639927251673966e-05, "loss": 2.3876, "step": 3572000 }, { "epoch": 7.478355159594065, "grad_norm": 19.086654663085938, "learning_rate": 1.2634682995183675e-05, "loss": 2.3785, "step": 3572500 }, { "epoch": 7.47940181531969, "grad_norm": 20.76500129699707, "learning_rate": 1.2629438738693383e-05, "loss": 2.3766, "step": 3573000 }, { "epoch": 7.480448471045316, "grad_norm": 23.715335845947266, "learning_rate": 1.262419448220309e-05, "loss": 2.374, "step": 3573500 }, { "epoch": 7.481495126770941, "grad_norm": 17.916696548461914, "learning_rate": 1.26189502257128e-05, "loss": 2.3599, "step": 3574000 }, { "epoch": 7.4825417824965665, "grad_norm": 19.579761505126953, "learning_rate": 1.2613705969222508e-05, "loss": 2.353, "step": 3574500 }, { "epoch": 7.483588438222192, "grad_norm": 18.071317672729492, "learning_rate": 1.2608461712732217e-05, "loss": 2.3649, "step": 3575000 }, { "epoch": 7.484635093947818, "grad_norm": 19.490652084350586, "learning_rate": 1.2603217456241925e-05, "loss": 2.3817, "step": 3575500 }, { "epoch": 7.485681749673444, "grad_norm": 14.729349136352539, "learning_rate": 1.2597973199751634e-05, "loss": 2.3683, "step": 3576000 }, { "epoch": 7.486728405399069, "grad_norm": 15.997456550598145, "learning_rate": 1.2592728943261342e-05, "loss": 2.3703, "step": 3576500 }, { "epoch": 7.487775061124695, "grad_norm": 18.638172149658203, "learning_rate": 1.2587484686771048e-05, "loss": 2.3829, "step": 3577000 }, { "epoch": 7.48882171685032, "grad_norm": 15.72919750213623, "learning_rate": 1.2582240430280756e-05, "loss": 2.3673, "step": 3577500 }, { "epoch": 7.489868372575946, "grad_norm": 31.596546173095703, "learning_rate": 1.2576996173790465e-05, "loss": 2.3601, "step": 3578000 }, { "epoch": 7.490915028301571, "grad_norm": 17.448043823242188, "learning_rate": 1.2571751917300173e-05, "loss": 2.3625, "step": 3578500 }, { "epoch": 7.4919616840271965, "grad_norm": 18.846046447753906, "learning_rate": 1.2566507660809882e-05, "loss": 2.3644, "step": 3579000 }, { "epoch": 7.493008339752822, "grad_norm": 16.371604919433594, "learning_rate": 1.256126340431959e-05, "loss": 2.366, "step": 3579500 }, { "epoch": 7.4940549954784474, "grad_norm": 17.425708770751953, "learning_rate": 1.2556019147829298e-05, "loss": 2.3638, "step": 3580000 }, { "epoch": 7.495101651204073, "grad_norm": 15.642621994018555, "learning_rate": 1.2550774891339007e-05, "loss": 2.3789, "step": 3580500 }, { "epoch": 7.496148306929698, "grad_norm": 18.16220474243164, "learning_rate": 1.2545530634848715e-05, "loss": 2.3799, "step": 3581000 }, { "epoch": 7.497194962655324, "grad_norm": 18.752153396606445, "learning_rate": 1.2540286378358424e-05, "loss": 2.364, "step": 3581500 }, { "epoch": 7.498241618380949, "grad_norm": 18.296201705932617, "learning_rate": 1.2535042121868132e-05, "loss": 2.3669, "step": 3582000 }, { "epoch": 7.499288274106575, "grad_norm": 18.086414337158203, "learning_rate": 1.2529797865377838e-05, "loss": 2.3637, "step": 3582500 }, { "epoch": 7.5003349298322, "grad_norm": 21.502534866333008, "learning_rate": 1.2524553608887545e-05, "loss": 2.3609, "step": 3583000 }, { "epoch": 7.501381585557826, "grad_norm": 17.29347038269043, "learning_rate": 1.2519309352397255e-05, "loss": 2.3859, "step": 3583500 }, { "epoch": 7.502428241283451, "grad_norm": 18.883737564086914, "learning_rate": 1.2514065095906963e-05, "loss": 2.3686, "step": 3584000 }, { "epoch": 7.5034748970090765, "grad_norm": 21.106685638427734, "learning_rate": 1.2508820839416672e-05, "loss": 2.3749, "step": 3584500 }, { "epoch": 7.504521552734702, "grad_norm": 17.24539566040039, "learning_rate": 1.250357658292638e-05, "loss": 2.3771, "step": 3585000 }, { "epoch": 7.5055682084603275, "grad_norm": 16.53855323791504, "learning_rate": 1.2498332326436087e-05, "loss": 2.3593, "step": 3585500 }, { "epoch": 7.506614864185953, "grad_norm": 21.995073318481445, "learning_rate": 1.2493088069945797e-05, "loss": 2.3683, "step": 3586000 }, { "epoch": 7.507661519911578, "grad_norm": 15.672816276550293, "learning_rate": 1.2487843813455504e-05, "loss": 2.3714, "step": 3586500 }, { "epoch": 7.508708175637204, "grad_norm": 20.526050567626953, "learning_rate": 1.2482599556965212e-05, "loss": 2.366, "step": 3587000 }, { "epoch": 7.509754831362829, "grad_norm": 18.716522216796875, "learning_rate": 1.247735530047492e-05, "loss": 2.374, "step": 3587500 }, { "epoch": 7.510801487088455, "grad_norm": 17.841156005859375, "learning_rate": 1.2472111043984629e-05, "loss": 2.3582, "step": 3588000 }, { "epoch": 7.51184814281408, "grad_norm": 17.687273025512695, "learning_rate": 1.2466866787494337e-05, "loss": 2.374, "step": 3588500 }, { "epoch": 7.512894798539706, "grad_norm": 21.368059158325195, "learning_rate": 1.2461622531004046e-05, "loss": 2.3781, "step": 3589000 }, { "epoch": 7.513941454265331, "grad_norm": 18.183948516845703, "learning_rate": 1.2456378274513752e-05, "loss": 2.3719, "step": 3589500 }, { "epoch": 7.514988109990957, "grad_norm": 16.67429542541504, "learning_rate": 1.2451134018023462e-05, "loss": 2.3522, "step": 3590000 }, { "epoch": 7.516034765716582, "grad_norm": 20.035133361816406, "learning_rate": 1.244588976153317e-05, "loss": 2.365, "step": 3590500 }, { "epoch": 7.5170814214422075, "grad_norm": 19.106050491333008, "learning_rate": 1.2440645505042877e-05, "loss": 2.3653, "step": 3591000 }, { "epoch": 7.518128077167834, "grad_norm": 20.766572952270508, "learning_rate": 1.2435401248552586e-05, "loss": 2.3791, "step": 3591500 }, { "epoch": 7.519174732893459, "grad_norm": 16.700820922851562, "learning_rate": 1.2430156992062294e-05, "loss": 2.3692, "step": 3592000 }, { "epoch": 7.520221388619085, "grad_norm": 17.09307289123535, "learning_rate": 1.2424912735572002e-05, "loss": 2.3694, "step": 3592500 }, { "epoch": 7.52126804434471, "grad_norm": 16.241168975830078, "learning_rate": 1.241966847908171e-05, "loss": 2.3532, "step": 3593000 }, { "epoch": 7.522314700070336, "grad_norm": 17.4859619140625, "learning_rate": 1.2414424222591419e-05, "loss": 2.3686, "step": 3593500 }, { "epoch": 7.523361355795961, "grad_norm": 19.2620792388916, "learning_rate": 1.2409179966101127e-05, "loss": 2.3735, "step": 3594000 }, { "epoch": 7.524408011521587, "grad_norm": 18.308452606201172, "learning_rate": 1.2403935709610836e-05, "loss": 2.3664, "step": 3594500 }, { "epoch": 7.525454667247212, "grad_norm": 16.69866180419922, "learning_rate": 1.2398691453120544e-05, "loss": 2.357, "step": 3595000 }, { "epoch": 7.5265013229728375, "grad_norm": 17.548067092895508, "learning_rate": 1.2393447196630251e-05, "loss": 2.3498, "step": 3595500 }, { "epoch": 7.527547978698463, "grad_norm": 18.86147689819336, "learning_rate": 1.2388202940139959e-05, "loss": 2.3538, "step": 3596000 }, { "epoch": 7.528594634424088, "grad_norm": 18.576934814453125, "learning_rate": 1.2382958683649667e-05, "loss": 2.347, "step": 3596500 }, { "epoch": 7.529641290149714, "grad_norm": 17.344013214111328, "learning_rate": 1.2377714427159376e-05, "loss": 2.3755, "step": 3597000 }, { "epoch": 7.530687945875339, "grad_norm": 18.092241287231445, "learning_rate": 1.2372470170669084e-05, "loss": 2.3626, "step": 3597500 }, { "epoch": 7.531734601600965, "grad_norm": 13.949957847595215, "learning_rate": 1.2367225914178791e-05, "loss": 2.3797, "step": 3598000 }, { "epoch": 7.53278125732659, "grad_norm": 19.102092742919922, "learning_rate": 1.2361981657688499e-05, "loss": 2.3493, "step": 3598500 }, { "epoch": 7.533827913052216, "grad_norm": 19.4199275970459, "learning_rate": 1.2356737401198209e-05, "loss": 2.3512, "step": 3599000 }, { "epoch": 7.534874568777841, "grad_norm": 17.05782699584961, "learning_rate": 1.2351493144707916e-05, "loss": 2.388, "step": 3599500 }, { "epoch": 7.535921224503467, "grad_norm": 17.634098052978516, "learning_rate": 1.2346248888217626e-05, "loss": 2.3657, "step": 3600000 }, { "epoch": 7.536967880229092, "grad_norm": 18.860275268554688, "learning_rate": 1.2341004631727333e-05, "loss": 2.3727, "step": 3600500 }, { "epoch": 7.5380145359547175, "grad_norm": 19.41071128845215, "learning_rate": 1.2335760375237041e-05, "loss": 2.3677, "step": 3601000 }, { "epoch": 7.539061191680343, "grad_norm": 17.926237106323242, "learning_rate": 1.2330516118746749e-05, "loss": 2.3773, "step": 3601500 }, { "epoch": 7.540107847405968, "grad_norm": 22.48310089111328, "learning_rate": 1.2325271862256458e-05, "loss": 2.3624, "step": 3602000 }, { "epoch": 7.541154503131594, "grad_norm": 19.752952575683594, "learning_rate": 1.2320027605766166e-05, "loss": 2.3862, "step": 3602500 }, { "epoch": 7.542201158857219, "grad_norm": 18.984283447265625, "learning_rate": 1.2314783349275873e-05, "loss": 2.3731, "step": 3603000 }, { "epoch": 7.543247814582845, "grad_norm": 16.47755241394043, "learning_rate": 1.2309539092785583e-05, "loss": 2.3717, "step": 3603500 }, { "epoch": 7.54429447030847, "grad_norm": 16.325193405151367, "learning_rate": 1.2304294836295289e-05, "loss": 2.3476, "step": 3604000 }, { "epoch": 7.545341126034096, "grad_norm": 18.66734504699707, "learning_rate": 1.2299050579804998e-05, "loss": 2.3529, "step": 3604500 }, { "epoch": 7.546387781759721, "grad_norm": 17.08601188659668, "learning_rate": 1.2293806323314706e-05, "loss": 2.3838, "step": 3605000 }, { "epoch": 7.547434437485347, "grad_norm": 18.62314796447754, "learning_rate": 1.2288562066824415e-05, "loss": 2.3679, "step": 3605500 }, { "epoch": 7.548481093210972, "grad_norm": 17.15838050842285, "learning_rate": 1.2283317810334123e-05, "loss": 2.3612, "step": 3606000 }, { "epoch": 7.5495277489365975, "grad_norm": 20.050397872924805, "learning_rate": 1.227807355384383e-05, "loss": 2.381, "step": 3606500 }, { "epoch": 7.550574404662223, "grad_norm": 16.784992218017578, "learning_rate": 1.2272829297353538e-05, "loss": 2.3816, "step": 3607000 }, { "epoch": 7.551621060387848, "grad_norm": 19.722684860229492, "learning_rate": 1.2267585040863248e-05, "loss": 2.3717, "step": 3607500 }, { "epoch": 7.552667716113474, "grad_norm": 17.03297233581543, "learning_rate": 1.2262340784372955e-05, "loss": 2.3722, "step": 3608000 }, { "epoch": 7.553714371839099, "grad_norm": 16.95058822631836, "learning_rate": 1.2257096527882663e-05, "loss": 2.3712, "step": 3608500 }, { "epoch": 7.554761027564725, "grad_norm": 19.228254318237305, "learning_rate": 1.2251852271392372e-05, "loss": 2.3771, "step": 3609000 }, { "epoch": 7.55580768329035, "grad_norm": 19.3953914642334, "learning_rate": 1.2246608014902078e-05, "loss": 2.366, "step": 3609500 }, { "epoch": 7.556854339015976, "grad_norm": 17.748098373413086, "learning_rate": 1.2241363758411788e-05, "loss": 2.3563, "step": 3610000 }, { "epoch": 7.557900994741601, "grad_norm": 20.56275749206543, "learning_rate": 1.2236119501921496e-05, "loss": 2.3802, "step": 3610500 }, { "epoch": 7.5589476504672275, "grad_norm": 16.196521759033203, "learning_rate": 1.2230875245431205e-05, "loss": 2.3645, "step": 3611000 }, { "epoch": 7.559994306192853, "grad_norm": 18.95059585571289, "learning_rate": 1.2225630988940913e-05, "loss": 2.3549, "step": 3611500 }, { "epoch": 7.561040961918478, "grad_norm": 20.976158142089844, "learning_rate": 1.2220386732450622e-05, "loss": 2.3719, "step": 3612000 }, { "epoch": 7.562087617644104, "grad_norm": 18.27942657470703, "learning_rate": 1.2215142475960328e-05, "loss": 2.3671, "step": 3612500 }, { "epoch": 7.563134273369729, "grad_norm": 20.306907653808594, "learning_rate": 1.2209898219470037e-05, "loss": 2.352, "step": 3613000 }, { "epoch": 7.564180929095355, "grad_norm": 16.97748374938965, "learning_rate": 1.2204653962979745e-05, "loss": 2.3508, "step": 3613500 }, { "epoch": 7.56522758482098, "grad_norm": 26.085546493530273, "learning_rate": 1.2199409706489453e-05, "loss": 2.3674, "step": 3614000 }, { "epoch": 7.566274240546606, "grad_norm": 19.19788360595703, "learning_rate": 1.2194165449999162e-05, "loss": 2.3525, "step": 3614500 }, { "epoch": 7.567320896272231, "grad_norm": 20.386796951293945, "learning_rate": 1.218892119350887e-05, "loss": 2.3766, "step": 3615000 }, { "epoch": 7.568367551997857, "grad_norm": 22.868650436401367, "learning_rate": 1.2183676937018578e-05, "loss": 2.3626, "step": 3615500 }, { "epoch": 7.569414207723482, "grad_norm": 16.49149513244629, "learning_rate": 1.2178432680528285e-05, "loss": 2.3614, "step": 3616000 }, { "epoch": 7.5704608634491075, "grad_norm": 18.539337158203125, "learning_rate": 1.2173188424037995e-05, "loss": 2.3578, "step": 3616500 }, { "epoch": 7.571507519174733, "grad_norm": 19.37854766845703, "learning_rate": 1.2167944167547702e-05, "loss": 2.3608, "step": 3617000 }, { "epoch": 7.572554174900358, "grad_norm": 21.264598846435547, "learning_rate": 1.2162699911057412e-05, "loss": 2.3711, "step": 3617500 }, { "epoch": 7.573600830625984, "grad_norm": 20.269365310668945, "learning_rate": 1.2157455654567118e-05, "loss": 2.3616, "step": 3618000 }, { "epoch": 7.574647486351609, "grad_norm": 16.642711639404297, "learning_rate": 1.2152211398076827e-05, "loss": 2.3623, "step": 3618500 }, { "epoch": 7.575694142077235, "grad_norm": 23.88625144958496, "learning_rate": 1.2146967141586535e-05, "loss": 2.3651, "step": 3619000 }, { "epoch": 7.57674079780286, "grad_norm": 20.85736083984375, "learning_rate": 1.2141722885096244e-05, "loss": 2.3983, "step": 3619500 }, { "epoch": 7.577787453528486, "grad_norm": 16.808452606201172, "learning_rate": 1.2136478628605952e-05, "loss": 2.3614, "step": 3620000 }, { "epoch": 7.578834109254111, "grad_norm": 18.888755798339844, "learning_rate": 1.213123437211566e-05, "loss": 2.3511, "step": 3620500 }, { "epoch": 7.579880764979737, "grad_norm": 17.91386604309082, "learning_rate": 1.2125990115625367e-05, "loss": 2.3545, "step": 3621000 }, { "epoch": 7.580927420705362, "grad_norm": 18.636363983154297, "learning_rate": 1.2120745859135075e-05, "loss": 2.3608, "step": 3621500 }, { "epoch": 7.5819740764309875, "grad_norm": 21.94869041442871, "learning_rate": 1.2115501602644784e-05, "loss": 2.3526, "step": 3622000 }, { "epoch": 7.583020732156613, "grad_norm": 20.130367279052734, "learning_rate": 1.2110257346154492e-05, "loss": 2.3569, "step": 3622500 }, { "epoch": 7.584067387882238, "grad_norm": 18.55095863342285, "learning_rate": 1.2105013089664201e-05, "loss": 2.3632, "step": 3623000 }, { "epoch": 7.585114043607864, "grad_norm": 22.298194885253906, "learning_rate": 1.2099768833173907e-05, "loss": 2.3862, "step": 3623500 }, { "epoch": 7.586160699333489, "grad_norm": 20.871238708496094, "learning_rate": 1.2094524576683617e-05, "loss": 2.3737, "step": 3624000 }, { "epoch": 7.587207355059115, "grad_norm": 20.466617584228516, "learning_rate": 1.2089280320193324e-05, "loss": 2.382, "step": 3624500 }, { "epoch": 7.58825401078474, "grad_norm": 15.533568382263184, "learning_rate": 1.2084036063703034e-05, "loss": 2.355, "step": 3625000 }, { "epoch": 7.589300666510366, "grad_norm": 18.598712921142578, "learning_rate": 1.2078791807212742e-05, "loss": 2.3675, "step": 3625500 }, { "epoch": 7.590347322235991, "grad_norm": 18.238731384277344, "learning_rate": 1.207354755072245e-05, "loss": 2.3715, "step": 3626000 }, { "epoch": 7.5913939779616175, "grad_norm": 19.878437042236328, "learning_rate": 1.2068303294232157e-05, "loss": 2.3714, "step": 3626500 }, { "epoch": 7.592440633687243, "grad_norm": 18.20253562927246, "learning_rate": 1.2063059037741865e-05, "loss": 2.3538, "step": 3627000 }, { "epoch": 7.593487289412868, "grad_norm": 19.0759334564209, "learning_rate": 1.2057814781251574e-05, "loss": 2.3442, "step": 3627500 }, { "epoch": 7.594533945138494, "grad_norm": 20.819217681884766, "learning_rate": 1.2052570524761282e-05, "loss": 2.363, "step": 3628000 }, { "epoch": 7.595580600864119, "grad_norm": 21.037067413330078, "learning_rate": 1.2047326268270991e-05, "loss": 2.3439, "step": 3628500 }, { "epoch": 7.596627256589745, "grad_norm": 21.912399291992188, "learning_rate": 1.2042082011780699e-05, "loss": 2.3683, "step": 3629000 }, { "epoch": 7.59767391231537, "grad_norm": 18.70072364807129, "learning_rate": 1.2036837755290406e-05, "loss": 2.3644, "step": 3629500 }, { "epoch": 7.598720568040996, "grad_norm": 18.221782684326172, "learning_rate": 1.2031593498800114e-05, "loss": 2.3724, "step": 3630000 }, { "epoch": 7.599767223766621, "grad_norm": 15.4769287109375, "learning_rate": 1.2026349242309824e-05, "loss": 2.3535, "step": 3630500 }, { "epoch": 7.600813879492247, "grad_norm": 17.476726531982422, "learning_rate": 1.2021104985819531e-05, "loss": 2.3691, "step": 3631000 }, { "epoch": 7.601860535217872, "grad_norm": 17.678817749023438, "learning_rate": 1.2015860729329239e-05, "loss": 2.3435, "step": 3631500 }, { "epoch": 7.6029071909434975, "grad_norm": 20.938791275024414, "learning_rate": 1.2010616472838947e-05, "loss": 2.3613, "step": 3632000 }, { "epoch": 7.603953846669123, "grad_norm": 15.930994987487793, "learning_rate": 1.2005372216348654e-05, "loss": 2.3806, "step": 3632500 }, { "epoch": 7.6050005023947485, "grad_norm": 18.165674209594727, "learning_rate": 1.2000127959858364e-05, "loss": 2.369, "step": 3633000 }, { "epoch": 7.606047158120374, "grad_norm": 18.453649520874023, "learning_rate": 1.1994883703368071e-05, "loss": 2.3735, "step": 3633500 }, { "epoch": 7.607093813845999, "grad_norm": 19.2774658203125, "learning_rate": 1.198963944687778e-05, "loss": 2.3694, "step": 3634000 }, { "epoch": 7.608140469571625, "grad_norm": 17.78990364074707, "learning_rate": 1.1984395190387488e-05, "loss": 2.3745, "step": 3634500 }, { "epoch": 7.60918712529725, "grad_norm": 17.257160186767578, "learning_rate": 1.1979150933897196e-05, "loss": 2.357, "step": 3635000 }, { "epoch": 7.610233781022876, "grad_norm": 18.698284149169922, "learning_rate": 1.1973906677406904e-05, "loss": 2.3579, "step": 3635500 }, { "epoch": 7.611280436748501, "grad_norm": 17.495119094848633, "learning_rate": 1.1968662420916613e-05, "loss": 2.3692, "step": 3636000 }, { "epoch": 7.612327092474127, "grad_norm": 18.401201248168945, "learning_rate": 1.1963418164426321e-05, "loss": 2.3785, "step": 3636500 }, { "epoch": 7.613373748199752, "grad_norm": 18.269180297851562, "learning_rate": 1.1958173907936029e-05, "loss": 2.375, "step": 3637000 }, { "epoch": 7.614420403925378, "grad_norm": 16.095949172973633, "learning_rate": 1.1952929651445738e-05, "loss": 2.3437, "step": 3637500 }, { "epoch": 7.615467059651003, "grad_norm": 17.640535354614258, "learning_rate": 1.1947685394955446e-05, "loss": 2.3592, "step": 3638000 }, { "epoch": 7.6165137153766285, "grad_norm": 15.515578269958496, "learning_rate": 1.1942441138465153e-05, "loss": 2.3541, "step": 3638500 }, { "epoch": 7.617560371102254, "grad_norm": 18.559133529663086, "learning_rate": 1.1937196881974861e-05, "loss": 2.3671, "step": 3639000 }, { "epoch": 7.618607026827879, "grad_norm": 16.17188835144043, "learning_rate": 1.193195262548457e-05, "loss": 2.3572, "step": 3639500 }, { "epoch": 7.619653682553505, "grad_norm": 19.467872619628906, "learning_rate": 1.1926708368994278e-05, "loss": 2.3436, "step": 3640000 }, { "epoch": 7.62070033827913, "grad_norm": 18.298276901245117, "learning_rate": 1.1921464112503986e-05, "loss": 2.3457, "step": 3640500 }, { "epoch": 7.621746994004756, "grad_norm": 18.593717575073242, "learning_rate": 1.1916219856013694e-05, "loss": 2.3466, "step": 3641000 }, { "epoch": 7.622793649730381, "grad_norm": 18.315298080444336, "learning_rate": 1.1910975599523403e-05, "loss": 2.3531, "step": 3641500 }, { "epoch": 7.623840305456007, "grad_norm": 21.272993087768555, "learning_rate": 1.190573134303311e-05, "loss": 2.3512, "step": 3642000 }, { "epoch": 7.624886961181632, "grad_norm": 16.594032287597656, "learning_rate": 1.190048708654282e-05, "loss": 2.3423, "step": 3642500 }, { "epoch": 7.625933616907258, "grad_norm": 16.997394561767578, "learning_rate": 1.1895242830052528e-05, "loss": 2.3743, "step": 3643000 }, { "epoch": 7.626980272632883, "grad_norm": 18.439640045166016, "learning_rate": 1.1889998573562235e-05, "loss": 2.3593, "step": 3643500 }, { "epoch": 7.6280269283585085, "grad_norm": 20.040437698364258, "learning_rate": 1.1884754317071943e-05, "loss": 2.3618, "step": 3644000 }, { "epoch": 7.629073584084134, "grad_norm": 15.890726089477539, "learning_rate": 1.187951006058165e-05, "loss": 2.3752, "step": 3644500 }, { "epoch": 7.630120239809759, "grad_norm": 15.057406425476074, "learning_rate": 1.187426580409136e-05, "loss": 2.3721, "step": 3645000 }, { "epoch": 7.631166895535385, "grad_norm": 19.938528060913086, "learning_rate": 1.1869021547601068e-05, "loss": 2.3544, "step": 3645500 }, { "epoch": 7.632213551261011, "grad_norm": 15.84771728515625, "learning_rate": 1.1863777291110777e-05, "loss": 2.3582, "step": 3646000 }, { "epoch": 7.633260206986637, "grad_norm": 18.338794708251953, "learning_rate": 1.1858533034620483e-05, "loss": 2.3596, "step": 3646500 }, { "epoch": 7.634306862712262, "grad_norm": 16.82748794555664, "learning_rate": 1.1853288778130193e-05, "loss": 2.3652, "step": 3647000 }, { "epoch": 7.635353518437888, "grad_norm": 16.245542526245117, "learning_rate": 1.18480445216399e-05, "loss": 2.3565, "step": 3647500 }, { "epoch": 7.636400174163513, "grad_norm": 17.462263107299805, "learning_rate": 1.184280026514961e-05, "loss": 2.3625, "step": 3648000 }, { "epoch": 7.6374468298891385, "grad_norm": 16.373626708984375, "learning_rate": 1.1837556008659317e-05, "loss": 2.361, "step": 3648500 }, { "epoch": 7.638493485614764, "grad_norm": 22.21917152404785, "learning_rate": 1.1832311752169025e-05, "loss": 2.3399, "step": 3649000 }, { "epoch": 7.639540141340389, "grad_norm": 19.353469848632812, "learning_rate": 1.1827067495678733e-05, "loss": 2.3693, "step": 3649500 }, { "epoch": 7.640586797066015, "grad_norm": 20.30394744873047, "learning_rate": 1.182182323918844e-05, "loss": 2.3533, "step": 3650000 }, { "epoch": 7.64163345279164, "grad_norm": 17.305953979492188, "learning_rate": 1.181657898269815e-05, "loss": 2.3935, "step": 3650500 }, { "epoch": 7.642680108517266, "grad_norm": 17.269451141357422, "learning_rate": 1.1811334726207858e-05, "loss": 2.391, "step": 3651000 }, { "epoch": 7.643726764242891, "grad_norm": 19.8678035736084, "learning_rate": 1.1806090469717567e-05, "loss": 2.371, "step": 3651500 }, { "epoch": 7.644773419968517, "grad_norm": 17.199329376220703, "learning_rate": 1.1800846213227273e-05, "loss": 2.3463, "step": 3652000 }, { "epoch": 7.645820075694142, "grad_norm": 18.889606475830078, "learning_rate": 1.1795601956736982e-05, "loss": 2.3489, "step": 3652500 }, { "epoch": 7.646866731419768, "grad_norm": 16.89557647705078, "learning_rate": 1.179035770024669e-05, "loss": 2.3667, "step": 3653000 }, { "epoch": 7.647913387145393, "grad_norm": 17.799667358398438, "learning_rate": 1.17851134437564e-05, "loss": 2.3769, "step": 3653500 }, { "epoch": 7.6489600428710185, "grad_norm": 21.76291847229004, "learning_rate": 1.1779869187266107e-05, "loss": 2.3433, "step": 3654000 }, { "epoch": 7.650006698596644, "grad_norm": 22.810142517089844, "learning_rate": 1.1774624930775815e-05, "loss": 2.3582, "step": 3654500 }, { "epoch": 7.651053354322269, "grad_norm": 17.62801170349121, "learning_rate": 1.1769380674285522e-05, "loss": 2.3471, "step": 3655000 }, { "epoch": 7.652100010047895, "grad_norm": 21.134357452392578, "learning_rate": 1.176413641779523e-05, "loss": 2.3446, "step": 3655500 }, { "epoch": 7.65314666577352, "grad_norm": 16.247737884521484, "learning_rate": 1.175889216130494e-05, "loss": 2.3702, "step": 3656000 }, { "epoch": 7.654193321499146, "grad_norm": 20.159643173217773, "learning_rate": 1.1753647904814647e-05, "loss": 2.348, "step": 3656500 }, { "epoch": 7.655239977224771, "grad_norm": 18.791141510009766, "learning_rate": 1.1748403648324357e-05, "loss": 2.3669, "step": 3657000 }, { "epoch": 7.656286632950397, "grad_norm": 18.721725463867188, "learning_rate": 1.1743159391834063e-05, "loss": 2.3752, "step": 3657500 }, { "epoch": 7.657333288676022, "grad_norm": 17.107906341552734, "learning_rate": 1.1737915135343772e-05, "loss": 2.3448, "step": 3658000 }, { "epoch": 7.658379944401648, "grad_norm": 18.652019500732422, "learning_rate": 1.173267087885348e-05, "loss": 2.3588, "step": 3658500 }, { "epoch": 7.659426600127273, "grad_norm": 19.129650115966797, "learning_rate": 1.1727426622363189e-05, "loss": 2.3684, "step": 3659000 }, { "epoch": 7.6604732558528985, "grad_norm": 16.91480827331543, "learning_rate": 1.1722182365872897e-05, "loss": 2.3713, "step": 3659500 }, { "epoch": 7.661519911578524, "grad_norm": 15.838888168334961, "learning_rate": 1.1716938109382606e-05, "loss": 2.3667, "step": 3660000 }, { "epoch": 7.662566567304149, "grad_norm": 18.7891845703125, "learning_rate": 1.1711693852892312e-05, "loss": 2.3431, "step": 3660500 }, { "epoch": 7.663613223029775, "grad_norm": 15.992749214172363, "learning_rate": 1.1706449596402022e-05, "loss": 2.3692, "step": 3661000 }, { "epoch": 7.664659878755401, "grad_norm": 23.470352172851562, "learning_rate": 1.170120533991173e-05, "loss": 2.3707, "step": 3661500 }, { "epoch": 7.665706534481027, "grad_norm": 19.545278549194336, "learning_rate": 1.1695961083421437e-05, "loss": 2.3441, "step": 3662000 }, { "epoch": 7.666753190206652, "grad_norm": 16.873783111572266, "learning_rate": 1.1690716826931146e-05, "loss": 2.3553, "step": 3662500 }, { "epoch": 7.667799845932278, "grad_norm": 17.316932678222656, "learning_rate": 1.1685472570440854e-05, "loss": 2.3598, "step": 3663000 }, { "epoch": 7.668846501657903, "grad_norm": 19.423526763916016, "learning_rate": 1.1680228313950562e-05, "loss": 2.3645, "step": 3663500 }, { "epoch": 7.6698931573835285, "grad_norm": 18.223833084106445, "learning_rate": 1.167498405746027e-05, "loss": 2.359, "step": 3664000 }, { "epoch": 7.670939813109154, "grad_norm": 19.476131439208984, "learning_rate": 1.1669739800969979e-05, "loss": 2.329, "step": 3664500 }, { "epoch": 7.671986468834779, "grad_norm": 19.099063873291016, "learning_rate": 1.1664495544479686e-05, "loss": 2.3685, "step": 3665000 }, { "epoch": 7.673033124560405, "grad_norm": 18.447948455810547, "learning_rate": 1.1659251287989396e-05, "loss": 2.3488, "step": 3665500 }, { "epoch": 7.67407978028603, "grad_norm": 24.955860137939453, "learning_rate": 1.1654007031499102e-05, "loss": 2.3584, "step": 3666000 }, { "epoch": 7.675126436011656, "grad_norm": 23.09219741821289, "learning_rate": 1.1648762775008811e-05, "loss": 2.3665, "step": 3666500 }, { "epoch": 7.676173091737281, "grad_norm": 18.7744140625, "learning_rate": 1.1643518518518519e-05, "loss": 2.3612, "step": 3667000 }, { "epoch": 7.677219747462907, "grad_norm": 18.834327697753906, "learning_rate": 1.1638274262028227e-05, "loss": 2.3676, "step": 3667500 }, { "epoch": 7.678266403188532, "grad_norm": 21.33323097229004, "learning_rate": 1.1633030005537936e-05, "loss": 2.3515, "step": 3668000 }, { "epoch": 7.679313058914158, "grad_norm": 18.40013313293457, "learning_rate": 1.1627785749047644e-05, "loss": 2.3793, "step": 3668500 }, { "epoch": 7.680359714639783, "grad_norm": 16.421480178833008, "learning_rate": 1.1622541492557351e-05, "loss": 2.3403, "step": 3669000 }, { "epoch": 7.6814063703654085, "grad_norm": 19.095006942749023, "learning_rate": 1.1617297236067059e-05, "loss": 2.3518, "step": 3669500 }, { "epoch": 7.682453026091034, "grad_norm": 16.98341178894043, "learning_rate": 1.1612052979576768e-05, "loss": 2.3613, "step": 3670000 }, { "epoch": 7.683499681816659, "grad_norm": 18.967077255249023, "learning_rate": 1.1606808723086476e-05, "loss": 2.3575, "step": 3670500 }, { "epoch": 7.684546337542285, "grad_norm": 15.969877243041992, "learning_rate": 1.1601564466596185e-05, "loss": 2.3574, "step": 3671000 }, { "epoch": 7.68559299326791, "grad_norm": 19.958887100219727, "learning_rate": 1.1596320210105893e-05, "loss": 2.3588, "step": 3671500 }, { "epoch": 7.686639648993536, "grad_norm": 19.767635345458984, "learning_rate": 1.1591075953615601e-05, "loss": 2.3532, "step": 3672000 }, { "epoch": 7.687686304719161, "grad_norm": 20.656356811523438, "learning_rate": 1.1585831697125309e-05, "loss": 2.3745, "step": 3672500 }, { "epoch": 7.688732960444787, "grad_norm": 15.708616256713867, "learning_rate": 1.1580587440635016e-05, "loss": 2.3672, "step": 3673000 }, { "epoch": 7.689779616170412, "grad_norm": 17.191869735717773, "learning_rate": 1.1575343184144726e-05, "loss": 2.3607, "step": 3673500 }, { "epoch": 7.690826271896038, "grad_norm": 21.993152618408203, "learning_rate": 1.1570098927654433e-05, "loss": 2.3558, "step": 3674000 }, { "epoch": 7.691872927621663, "grad_norm": 23.4174747467041, "learning_rate": 1.1564854671164141e-05, "loss": 2.3453, "step": 3674500 }, { "epoch": 7.6929195833472885, "grad_norm": 21.43183135986328, "learning_rate": 1.1559610414673849e-05, "loss": 2.3617, "step": 3675000 }, { "epoch": 7.693966239072914, "grad_norm": 17.522981643676758, "learning_rate": 1.1554366158183558e-05, "loss": 2.3592, "step": 3675500 }, { "epoch": 7.6950128947985394, "grad_norm": 16.33531379699707, "learning_rate": 1.1549121901693266e-05, "loss": 2.3576, "step": 3676000 }, { "epoch": 7.696059550524165, "grad_norm": 19.36294937133789, "learning_rate": 1.1543877645202975e-05, "loss": 2.3667, "step": 3676500 }, { "epoch": 7.69710620624979, "grad_norm": 18.338224411010742, "learning_rate": 1.1538633388712683e-05, "loss": 2.349, "step": 3677000 }, { "epoch": 7.698152861975416, "grad_norm": 17.366331100463867, "learning_rate": 1.153338913222239e-05, "loss": 2.339, "step": 3677500 }, { "epoch": 7.699199517701041, "grad_norm": 18.111602783203125, "learning_rate": 1.1528144875732098e-05, "loss": 2.3869, "step": 3678000 }, { "epoch": 7.700246173426667, "grad_norm": 20.09990119934082, "learning_rate": 1.1522900619241808e-05, "loss": 2.3587, "step": 3678500 }, { "epoch": 7.701292829152292, "grad_norm": 28.811643600463867, "learning_rate": 1.1517656362751515e-05, "loss": 2.3626, "step": 3679000 }, { "epoch": 7.702339484877918, "grad_norm": 18.588706970214844, "learning_rate": 1.1512412106261223e-05, "loss": 2.3855, "step": 3679500 }, { "epoch": 7.703386140603543, "grad_norm": 18.41972541809082, "learning_rate": 1.1507167849770932e-05, "loss": 2.3516, "step": 3680000 }, { "epoch": 7.7044327963291686, "grad_norm": 16.272958755493164, "learning_rate": 1.1501923593280638e-05, "loss": 2.3388, "step": 3680500 }, { "epoch": 7.705479452054795, "grad_norm": 21.909189224243164, "learning_rate": 1.1496679336790348e-05, "loss": 2.3503, "step": 3681000 }, { "epoch": 7.70652610778042, "grad_norm": 20.252775192260742, "learning_rate": 1.1491435080300055e-05, "loss": 2.3531, "step": 3681500 }, { "epoch": 7.707572763506046, "grad_norm": 16.76420783996582, "learning_rate": 1.1486190823809765e-05, "loss": 2.3474, "step": 3682000 }, { "epoch": 7.708619419231671, "grad_norm": 20.906726837158203, "learning_rate": 1.1480946567319473e-05, "loss": 2.3592, "step": 3682500 }, { "epoch": 7.709666074957297, "grad_norm": 17.19232749938965, "learning_rate": 1.1475702310829182e-05, "loss": 2.3685, "step": 3683000 }, { "epoch": 7.710712730682922, "grad_norm": 14.278427124023438, "learning_rate": 1.1470458054338888e-05, "loss": 2.3447, "step": 3683500 }, { "epoch": 7.711759386408548, "grad_norm": 15.728169441223145, "learning_rate": 1.1465213797848597e-05, "loss": 2.3476, "step": 3684000 }, { "epoch": 7.712806042134173, "grad_norm": 20.35359001159668, "learning_rate": 1.1459969541358305e-05, "loss": 2.3592, "step": 3684500 }, { "epoch": 7.7138526978597985, "grad_norm": 18.300125122070312, "learning_rate": 1.1454725284868013e-05, "loss": 2.3561, "step": 3685000 }, { "epoch": 7.714899353585424, "grad_norm": 22.059646606445312, "learning_rate": 1.1449481028377722e-05, "loss": 2.3445, "step": 3685500 }, { "epoch": 7.7159460093110495, "grad_norm": 17.134469985961914, "learning_rate": 1.1444236771887428e-05, "loss": 2.3561, "step": 3686000 }, { "epoch": 7.716992665036675, "grad_norm": 17.755897521972656, "learning_rate": 1.1438992515397137e-05, "loss": 2.3436, "step": 3686500 }, { "epoch": 7.7180393207623, "grad_norm": 15.584162712097168, "learning_rate": 1.1433748258906845e-05, "loss": 2.3666, "step": 3687000 }, { "epoch": 7.719085976487926, "grad_norm": 18.02531623840332, "learning_rate": 1.1428504002416555e-05, "loss": 2.3688, "step": 3687500 }, { "epoch": 7.720132632213551, "grad_norm": 17.130023956298828, "learning_rate": 1.1423259745926262e-05, "loss": 2.3529, "step": 3688000 }, { "epoch": 7.721179287939177, "grad_norm": 19.28011131286621, "learning_rate": 1.1418015489435972e-05, "loss": 2.3481, "step": 3688500 }, { "epoch": 7.722225943664802, "grad_norm": 17.481884002685547, "learning_rate": 1.1412771232945678e-05, "loss": 2.3641, "step": 3689000 }, { "epoch": 7.723272599390428, "grad_norm": 19.037622451782227, "learning_rate": 1.1407526976455387e-05, "loss": 2.3533, "step": 3689500 }, { "epoch": 7.724319255116053, "grad_norm": 17.785966873168945, "learning_rate": 1.1402282719965095e-05, "loss": 2.3523, "step": 3690000 }, { "epoch": 7.725365910841679, "grad_norm": 18.648605346679688, "learning_rate": 1.1397038463474802e-05, "loss": 2.3538, "step": 3690500 }, { "epoch": 7.726412566567304, "grad_norm": 20.384918212890625, "learning_rate": 1.1391794206984512e-05, "loss": 2.3654, "step": 3691000 }, { "epoch": 7.7274592222929295, "grad_norm": 19.3894100189209, "learning_rate": 1.138654995049422e-05, "loss": 2.3471, "step": 3691500 }, { "epoch": 7.728505878018555, "grad_norm": 19.03530502319336, "learning_rate": 1.1381305694003927e-05, "loss": 2.3542, "step": 3692000 }, { "epoch": 7.72955253374418, "grad_norm": 15.600479125976562, "learning_rate": 1.1376061437513635e-05, "loss": 2.3698, "step": 3692500 }, { "epoch": 7.730599189469806, "grad_norm": 18.371244430541992, "learning_rate": 1.1370817181023344e-05, "loss": 2.3385, "step": 3693000 }, { "epoch": 7.731645845195431, "grad_norm": 20.844911575317383, "learning_rate": 1.1365572924533052e-05, "loss": 2.3379, "step": 3693500 }, { "epoch": 7.732692500921057, "grad_norm": 16.274669647216797, "learning_rate": 1.1360328668042761e-05, "loss": 2.3571, "step": 3694000 }, { "epoch": 7.733739156646682, "grad_norm": 15.483747482299805, "learning_rate": 1.1355084411552467e-05, "loss": 2.3562, "step": 3694500 }, { "epoch": 7.734785812372308, "grad_norm": 22.432703018188477, "learning_rate": 1.1349840155062177e-05, "loss": 2.3712, "step": 3695000 }, { "epoch": 7.735832468097933, "grad_norm": 20.35329818725586, "learning_rate": 1.1344595898571884e-05, "loss": 2.364, "step": 3695500 }, { "epoch": 7.736879123823559, "grad_norm": 19.028806686401367, "learning_rate": 1.1339351642081592e-05, "loss": 2.3471, "step": 3696000 }, { "epoch": 7.737925779549185, "grad_norm": 15.575340270996094, "learning_rate": 1.1334107385591301e-05, "loss": 2.3522, "step": 3696500 }, { "epoch": 7.73897243527481, "grad_norm": 30.349750518798828, "learning_rate": 1.1328863129101009e-05, "loss": 2.3561, "step": 3697000 }, { "epoch": 7.740019091000436, "grad_norm": 17.399864196777344, "learning_rate": 1.1323618872610717e-05, "loss": 2.353, "step": 3697500 }, { "epoch": 7.741065746726061, "grad_norm": 19.754863739013672, "learning_rate": 1.1318374616120425e-05, "loss": 2.3579, "step": 3698000 }, { "epoch": 7.742112402451687, "grad_norm": 16.71983528137207, "learning_rate": 1.1313130359630134e-05, "loss": 2.342, "step": 3698500 }, { "epoch": 7.743159058177312, "grad_norm": 22.57883071899414, "learning_rate": 1.1307886103139842e-05, "loss": 2.339, "step": 3699000 }, { "epoch": 7.744205713902938, "grad_norm": 18.92791748046875, "learning_rate": 1.1302641846649551e-05, "loss": 2.3548, "step": 3699500 }, { "epoch": 7.745252369628563, "grad_norm": 19.02851104736328, "learning_rate": 1.1297397590159259e-05, "loss": 2.3573, "step": 3700000 }, { "epoch": 7.746299025354189, "grad_norm": 18.90933609008789, "learning_rate": 1.1292153333668966e-05, "loss": 2.3432, "step": 3700500 }, { "epoch": 7.747345681079814, "grad_norm": 16.520296096801758, "learning_rate": 1.1286909077178674e-05, "loss": 2.3539, "step": 3701000 }, { "epoch": 7.7483923368054395, "grad_norm": 16.531848907470703, "learning_rate": 1.1281664820688383e-05, "loss": 2.3547, "step": 3701500 }, { "epoch": 7.749438992531065, "grad_norm": 28.066631317138672, "learning_rate": 1.1276420564198091e-05, "loss": 2.3492, "step": 3702000 }, { "epoch": 7.75048564825669, "grad_norm": 17.99880027770996, "learning_rate": 1.1271176307707799e-05, "loss": 2.3392, "step": 3702500 }, { "epoch": 7.751532303982316, "grad_norm": 17.806068420410156, "learning_rate": 1.1265932051217507e-05, "loss": 2.3576, "step": 3703000 }, { "epoch": 7.752578959707941, "grad_norm": 17.589574813842773, "learning_rate": 1.1260687794727214e-05, "loss": 2.3597, "step": 3703500 }, { "epoch": 7.753625615433567, "grad_norm": 16.03114891052246, "learning_rate": 1.1255443538236924e-05, "loss": 2.3647, "step": 3704000 }, { "epoch": 7.754672271159192, "grad_norm": 21.40427017211914, "learning_rate": 1.1250199281746631e-05, "loss": 2.3636, "step": 3704500 }, { "epoch": 7.755718926884818, "grad_norm": 24.520709991455078, "learning_rate": 1.124495502525634e-05, "loss": 2.3384, "step": 3705000 }, { "epoch": 7.756765582610443, "grad_norm": 17.414968490600586, "learning_rate": 1.1239710768766048e-05, "loss": 2.3427, "step": 3705500 }, { "epoch": 7.757812238336069, "grad_norm": 17.50424575805664, "learning_rate": 1.1234466512275756e-05, "loss": 2.3509, "step": 3706000 }, { "epoch": 7.758858894061694, "grad_norm": 17.993192672729492, "learning_rate": 1.1229222255785464e-05, "loss": 2.3285, "step": 3706500 }, { "epoch": 7.7599055497873195, "grad_norm": 19.863784790039062, "learning_rate": 1.1223977999295173e-05, "loss": 2.3643, "step": 3707000 }, { "epoch": 7.760952205512945, "grad_norm": 15.400320053100586, "learning_rate": 1.121873374280488e-05, "loss": 2.3357, "step": 3707500 }, { "epoch": 7.76199886123857, "grad_norm": 16.2596492767334, "learning_rate": 1.1213489486314589e-05, "loss": 2.3562, "step": 3708000 }, { "epoch": 7.763045516964196, "grad_norm": 16.054391860961914, "learning_rate": 1.1208245229824298e-05, "loss": 2.3527, "step": 3708500 }, { "epoch": 7.764092172689821, "grad_norm": 18.195226669311523, "learning_rate": 1.1203000973334004e-05, "loss": 2.3578, "step": 3709000 }, { "epoch": 7.765138828415447, "grad_norm": 19.890512466430664, "learning_rate": 1.1197756716843713e-05, "loss": 2.3473, "step": 3709500 }, { "epoch": 7.766185484141072, "grad_norm": 21.69033432006836, "learning_rate": 1.1192512460353421e-05, "loss": 2.3631, "step": 3710000 }, { "epoch": 7.767232139866698, "grad_norm": 16.075416564941406, "learning_rate": 1.118726820386313e-05, "loss": 2.3677, "step": 3710500 }, { "epoch": 7.768278795592323, "grad_norm": 19.018665313720703, "learning_rate": 1.1182023947372838e-05, "loss": 2.351, "step": 3711000 }, { "epoch": 7.769325451317949, "grad_norm": 18.86760902404785, "learning_rate": 1.1176779690882546e-05, "loss": 2.3594, "step": 3711500 }, { "epoch": 7.770372107043574, "grad_norm": 19.232553482055664, "learning_rate": 1.1171535434392253e-05, "loss": 2.3499, "step": 3712000 }, { "epoch": 7.7714187627691995, "grad_norm": 18.58547592163086, "learning_rate": 1.1166291177901963e-05, "loss": 2.363, "step": 3712500 }, { "epoch": 7.772465418494825, "grad_norm": 20.20587921142578, "learning_rate": 1.116104692141167e-05, "loss": 2.3578, "step": 3713000 }, { "epoch": 7.77351207422045, "grad_norm": 18.233787536621094, "learning_rate": 1.1155802664921378e-05, "loss": 2.3509, "step": 3713500 }, { "epoch": 7.774558729946076, "grad_norm": 17.665245056152344, "learning_rate": 1.1150558408431088e-05, "loss": 2.3417, "step": 3714000 }, { "epoch": 7.775605385671701, "grad_norm": 16.116254806518555, "learning_rate": 1.1145314151940794e-05, "loss": 2.3461, "step": 3714500 }, { "epoch": 7.776652041397327, "grad_norm": 19.210765838623047, "learning_rate": 1.1140069895450503e-05, "loss": 2.3373, "step": 3715000 }, { "epoch": 7.777698697122952, "grad_norm": 17.125244140625, "learning_rate": 1.113482563896021e-05, "loss": 2.3691, "step": 3715500 }, { "epoch": 7.778745352848579, "grad_norm": 16.76175308227539, "learning_rate": 1.112958138246992e-05, "loss": 2.3522, "step": 3716000 }, { "epoch": 7.779792008574204, "grad_norm": 16.621335983276367, "learning_rate": 1.1124337125979628e-05, "loss": 2.3571, "step": 3716500 }, { "epoch": 7.7808386642998295, "grad_norm": 18.52570343017578, "learning_rate": 1.1119092869489337e-05, "loss": 2.3605, "step": 3717000 }, { "epoch": 7.781885320025455, "grad_norm": 18.76047706604004, "learning_rate": 1.1113848612999043e-05, "loss": 2.3521, "step": 3717500 }, { "epoch": 7.78293197575108, "grad_norm": 16.919706344604492, "learning_rate": 1.1108604356508753e-05, "loss": 2.3478, "step": 3718000 }, { "epoch": 7.783978631476706, "grad_norm": 16.711227416992188, "learning_rate": 1.110336010001846e-05, "loss": 2.3562, "step": 3718500 }, { "epoch": 7.785025287202331, "grad_norm": 16.556482315063477, "learning_rate": 1.109811584352817e-05, "loss": 2.3574, "step": 3719000 }, { "epoch": 7.786071942927957, "grad_norm": 18.650997161865234, "learning_rate": 1.1092871587037877e-05, "loss": 2.3465, "step": 3719500 }, { "epoch": 7.787118598653582, "grad_norm": 20.190357208251953, "learning_rate": 1.1087627330547585e-05, "loss": 2.3515, "step": 3720000 }, { "epoch": 7.788165254379208, "grad_norm": 21.965822219848633, "learning_rate": 1.1082383074057293e-05, "loss": 2.3495, "step": 3720500 }, { "epoch": 7.789211910104833, "grad_norm": 26.1800479888916, "learning_rate": 1.1077138817567e-05, "loss": 2.3434, "step": 3721000 }, { "epoch": 7.790258565830459, "grad_norm": 18.732948303222656, "learning_rate": 1.107189456107671e-05, "loss": 2.3531, "step": 3721500 }, { "epoch": 7.791305221556084, "grad_norm": 17.104032516479492, "learning_rate": 1.1066650304586417e-05, "loss": 2.3528, "step": 3722000 }, { "epoch": 7.7923518772817095, "grad_norm": 16.614961624145508, "learning_rate": 1.1061406048096127e-05, "loss": 2.3389, "step": 3722500 }, { "epoch": 7.793398533007335, "grad_norm": 19.382495880126953, "learning_rate": 1.1056161791605833e-05, "loss": 2.3303, "step": 3723000 }, { "epoch": 7.79444518873296, "grad_norm": 19.604633331298828, "learning_rate": 1.1050917535115542e-05, "loss": 2.3479, "step": 3723500 }, { "epoch": 7.795491844458586, "grad_norm": 17.984844207763672, "learning_rate": 1.104567327862525e-05, "loss": 2.3492, "step": 3724000 }, { "epoch": 7.796538500184211, "grad_norm": 15.836247444152832, "learning_rate": 1.104042902213496e-05, "loss": 2.3458, "step": 3724500 }, { "epoch": 7.797585155909837, "grad_norm": 17.646656036376953, "learning_rate": 1.1035184765644667e-05, "loss": 2.353, "step": 3725000 }, { "epoch": 7.798631811635462, "grad_norm": 20.888530731201172, "learning_rate": 1.1029940509154375e-05, "loss": 2.3563, "step": 3725500 }, { "epoch": 7.799678467361088, "grad_norm": 18.414316177368164, "learning_rate": 1.1024696252664082e-05, "loss": 2.3454, "step": 3726000 }, { "epoch": 7.800725123086713, "grad_norm": 15.710043907165527, "learning_rate": 1.101945199617379e-05, "loss": 2.3512, "step": 3726500 }, { "epoch": 7.801771778812339, "grad_norm": 20.988113403320312, "learning_rate": 1.10142077396835e-05, "loss": 2.3653, "step": 3727000 }, { "epoch": 7.802818434537964, "grad_norm": 20.866519927978516, "learning_rate": 1.1008963483193207e-05, "loss": 2.3595, "step": 3727500 }, { "epoch": 7.8038650902635895, "grad_norm": 18.514890670776367, "learning_rate": 1.1003719226702916e-05, "loss": 2.3508, "step": 3728000 }, { "epoch": 7.804911745989215, "grad_norm": 17.095462799072266, "learning_rate": 1.0998474970212622e-05, "loss": 2.3545, "step": 3728500 }, { "epoch": 7.8059584017148405, "grad_norm": 18.070148468017578, "learning_rate": 1.0993230713722332e-05, "loss": 2.3685, "step": 3729000 }, { "epoch": 7.807005057440466, "grad_norm": 17.24912452697754, "learning_rate": 1.098798645723204e-05, "loss": 2.3532, "step": 3729500 }, { "epoch": 7.808051713166091, "grad_norm": 17.69502830505371, "learning_rate": 1.0982742200741749e-05, "loss": 2.3484, "step": 3730000 }, { "epoch": 7.809098368891717, "grad_norm": 18.81271743774414, "learning_rate": 1.0977497944251457e-05, "loss": 2.3454, "step": 3730500 }, { "epoch": 7.810145024617342, "grad_norm": 17.12797737121582, "learning_rate": 1.0972253687761164e-05, "loss": 2.3417, "step": 3731000 }, { "epoch": 7.811191680342969, "grad_norm": 19.66063117980957, "learning_rate": 1.0967009431270872e-05, "loss": 2.3526, "step": 3731500 }, { "epoch": 7.812238336068594, "grad_norm": 20.10390853881836, "learning_rate": 1.096176517478058e-05, "loss": 2.3498, "step": 3732000 }, { "epoch": 7.8132849917942195, "grad_norm": 20.286861419677734, "learning_rate": 1.0956520918290289e-05, "loss": 2.3355, "step": 3732500 }, { "epoch": 7.814331647519845, "grad_norm": 17.094507217407227, "learning_rate": 1.0951276661799997e-05, "loss": 2.3627, "step": 3733000 }, { "epoch": 7.8153783032454704, "grad_norm": 18.580276489257812, "learning_rate": 1.0946032405309706e-05, "loss": 2.3452, "step": 3733500 }, { "epoch": 7.816424958971096, "grad_norm": 18.769851684570312, "learning_rate": 1.0940788148819414e-05, "loss": 2.3448, "step": 3734000 }, { "epoch": 7.817471614696721, "grad_norm": 18.829206466674805, "learning_rate": 1.0935543892329122e-05, "loss": 2.3595, "step": 3734500 }, { "epoch": 7.818518270422347, "grad_norm": 19.058380126953125, "learning_rate": 1.093029963583883e-05, "loss": 2.3665, "step": 3735000 }, { "epoch": 7.819564926147972, "grad_norm": 19.94114112854004, "learning_rate": 1.0925055379348539e-05, "loss": 2.3377, "step": 3735500 }, { "epoch": 7.820611581873598, "grad_norm": 18.704959869384766, "learning_rate": 1.0919811122858246e-05, "loss": 2.3774, "step": 3736000 }, { "epoch": 7.821658237599223, "grad_norm": 18.92997932434082, "learning_rate": 1.0914566866367954e-05, "loss": 2.3628, "step": 3736500 }, { "epoch": 7.822704893324849, "grad_norm": 17.764631271362305, "learning_rate": 1.0909322609877662e-05, "loss": 2.3427, "step": 3737000 }, { "epoch": 7.823751549050474, "grad_norm": 20.821386337280273, "learning_rate": 1.0904078353387371e-05, "loss": 2.3384, "step": 3737500 }, { "epoch": 7.8247982047760996, "grad_norm": 18.331871032714844, "learning_rate": 1.0898834096897079e-05, "loss": 2.34, "step": 3738000 }, { "epoch": 7.825844860501725, "grad_norm": 17.275400161743164, "learning_rate": 1.0893589840406786e-05, "loss": 2.3638, "step": 3738500 }, { "epoch": 7.8268915162273505, "grad_norm": 16.966968536376953, "learning_rate": 1.0888345583916496e-05, "loss": 2.3671, "step": 3739000 }, { "epoch": 7.827938171952976, "grad_norm": 16.452329635620117, "learning_rate": 1.0883101327426204e-05, "loss": 2.3446, "step": 3739500 }, { "epoch": 7.828984827678601, "grad_norm": 21.027915954589844, "learning_rate": 1.0877857070935911e-05, "loss": 2.3516, "step": 3740000 }, { "epoch": 7.830031483404227, "grad_norm": 17.602638244628906, "learning_rate": 1.0872612814445619e-05, "loss": 2.3624, "step": 3740500 }, { "epoch": 7.831078139129852, "grad_norm": 18.66880989074707, "learning_rate": 1.0867368557955328e-05, "loss": 2.3491, "step": 3741000 }, { "epoch": 7.832124794855478, "grad_norm": 17.184940338134766, "learning_rate": 1.0862124301465036e-05, "loss": 2.3518, "step": 3741500 }, { "epoch": 7.833171450581103, "grad_norm": 18.65412139892578, "learning_rate": 1.0856880044974745e-05, "loss": 2.3607, "step": 3742000 }, { "epoch": 7.834218106306729, "grad_norm": 14.983266830444336, "learning_rate": 1.0851635788484453e-05, "loss": 2.3566, "step": 3742500 }, { "epoch": 7.835264762032354, "grad_norm": 22.011585235595703, "learning_rate": 1.084639153199416e-05, "loss": 2.3347, "step": 3743000 }, { "epoch": 7.83631141775798, "grad_norm": 21.467571258544922, "learning_rate": 1.0841147275503868e-05, "loss": 2.3453, "step": 3743500 }, { "epoch": 7.837358073483605, "grad_norm": 18.34151268005371, "learning_rate": 1.0835903019013576e-05, "loss": 2.3647, "step": 3744000 }, { "epoch": 7.8384047292092305, "grad_norm": 15.010513305664062, "learning_rate": 1.0830658762523286e-05, "loss": 2.3685, "step": 3744500 }, { "epoch": 7.839451384934856, "grad_norm": 19.38908576965332, "learning_rate": 1.0825414506032993e-05, "loss": 2.3647, "step": 3745000 }, { "epoch": 7.840498040660481, "grad_norm": 17.762372970581055, "learning_rate": 1.0820170249542701e-05, "loss": 2.3704, "step": 3745500 }, { "epoch": 7.841544696386107, "grad_norm": 16.21873664855957, "learning_rate": 1.0814925993052409e-05, "loss": 2.3496, "step": 3746000 }, { "epoch": 7.842591352111732, "grad_norm": 20.278587341308594, "learning_rate": 1.0809681736562118e-05, "loss": 2.3677, "step": 3746500 }, { "epoch": 7.843638007837358, "grad_norm": 20.261384963989258, "learning_rate": 1.0804437480071826e-05, "loss": 2.3346, "step": 3747000 }, { "epoch": 7.844684663562983, "grad_norm": 21.232826232910156, "learning_rate": 1.0799193223581535e-05, "loss": 2.3638, "step": 3747500 }, { "epoch": 7.845731319288609, "grad_norm": 15.671854972839355, "learning_rate": 1.0793948967091243e-05, "loss": 2.3414, "step": 3748000 }, { "epoch": 7.846777975014234, "grad_norm": 22.058988571166992, "learning_rate": 1.078870471060095e-05, "loss": 2.3637, "step": 3748500 }, { "epoch": 7.84782463073986, "grad_norm": 17.825105667114258, "learning_rate": 1.0783460454110658e-05, "loss": 2.3389, "step": 3749000 }, { "epoch": 7.848871286465485, "grad_norm": 22.018531799316406, "learning_rate": 1.0778216197620366e-05, "loss": 2.3633, "step": 3749500 }, { "epoch": 7.8499179421911105, "grad_norm": 20.99919891357422, "learning_rate": 1.0772971941130075e-05, "loss": 2.336, "step": 3750000 }, { "epoch": 7.850964597916736, "grad_norm": 19.928268432617188, "learning_rate": 1.0767727684639783e-05, "loss": 2.3478, "step": 3750500 }, { "epoch": 7.852011253642362, "grad_norm": 19.093990325927734, "learning_rate": 1.0762483428149492e-05, "loss": 2.3568, "step": 3751000 }, { "epoch": 7.853057909367988, "grad_norm": 20.124380111694336, "learning_rate": 1.0757239171659198e-05, "loss": 2.3472, "step": 3751500 }, { "epoch": 7.854104565093613, "grad_norm": 19.667631149291992, "learning_rate": 1.0751994915168908e-05, "loss": 2.3357, "step": 3752000 }, { "epoch": 7.855151220819239, "grad_norm": 21.813819885253906, "learning_rate": 1.0746750658678615e-05, "loss": 2.3532, "step": 3752500 }, { "epoch": 7.856197876544864, "grad_norm": 18.21776580810547, "learning_rate": 1.0741506402188325e-05, "loss": 2.3379, "step": 3753000 }, { "epoch": 7.85724453227049, "grad_norm": 23.122852325439453, "learning_rate": 1.0736262145698032e-05, "loss": 2.3561, "step": 3753500 }, { "epoch": 7.858291187996115, "grad_norm": 20.290977478027344, "learning_rate": 1.073101788920774e-05, "loss": 2.3587, "step": 3754000 }, { "epoch": 7.8593378437217405, "grad_norm": 16.94774055480957, "learning_rate": 1.0725773632717448e-05, "loss": 2.3506, "step": 3754500 }, { "epoch": 7.860384499447366, "grad_norm": 17.812274932861328, "learning_rate": 1.0720529376227156e-05, "loss": 2.3457, "step": 3755000 }, { "epoch": 7.861431155172991, "grad_norm": 21.731218338012695, "learning_rate": 1.0715285119736865e-05, "loss": 2.3482, "step": 3755500 }, { "epoch": 7.862477810898617, "grad_norm": 18.753061294555664, "learning_rate": 1.0710040863246573e-05, "loss": 2.36, "step": 3756000 }, { "epoch": 7.863524466624242, "grad_norm": 18.794788360595703, "learning_rate": 1.0704796606756282e-05, "loss": 2.3514, "step": 3756500 }, { "epoch": 7.864571122349868, "grad_norm": 19.998546600341797, "learning_rate": 1.0699552350265988e-05, "loss": 2.3747, "step": 3757000 }, { "epoch": 7.865617778075493, "grad_norm": 19.99862289428711, "learning_rate": 1.0694308093775697e-05, "loss": 2.3408, "step": 3757500 }, { "epoch": 7.866664433801119, "grad_norm": 19.85248565673828, "learning_rate": 1.0689063837285405e-05, "loss": 2.339, "step": 3758000 }, { "epoch": 7.867711089526744, "grad_norm": 21.29397964477539, "learning_rate": 1.0683819580795114e-05, "loss": 2.3696, "step": 3758500 }, { "epoch": 7.86875774525237, "grad_norm": 17.561321258544922, "learning_rate": 1.0678575324304822e-05, "loss": 2.3605, "step": 3759000 }, { "epoch": 7.869804400977995, "grad_norm": 15.735374450683594, "learning_rate": 1.067333106781453e-05, "loss": 2.3487, "step": 3759500 }, { "epoch": 7.8708510567036205, "grad_norm": 17.144865036010742, "learning_rate": 1.0668086811324238e-05, "loss": 2.3532, "step": 3760000 }, { "epoch": 7.871897712429246, "grad_norm": 20.441871643066406, "learning_rate": 1.0662842554833947e-05, "loss": 2.3476, "step": 3760500 }, { "epoch": 7.872944368154871, "grad_norm": 17.92083168029785, "learning_rate": 1.0657598298343655e-05, "loss": 2.3554, "step": 3761000 }, { "epoch": 7.873991023880497, "grad_norm": 28.61471176147461, "learning_rate": 1.0652354041853362e-05, "loss": 2.3587, "step": 3761500 }, { "epoch": 7.875037679606122, "grad_norm": 19.337900161743164, "learning_rate": 1.0647109785363072e-05, "loss": 2.336, "step": 3762000 }, { "epoch": 7.876084335331748, "grad_norm": 19.2296085357666, "learning_rate": 1.0641865528872778e-05, "loss": 2.3539, "step": 3762500 }, { "epoch": 7.877130991057373, "grad_norm": 19.586090087890625, "learning_rate": 1.0636621272382487e-05, "loss": 2.3477, "step": 3763000 }, { "epoch": 7.878177646782999, "grad_norm": 23.514400482177734, "learning_rate": 1.0631377015892195e-05, "loss": 2.3601, "step": 3763500 }, { "epoch": 7.879224302508624, "grad_norm": 16.239276885986328, "learning_rate": 1.0626132759401904e-05, "loss": 2.346, "step": 3764000 }, { "epoch": 7.88027095823425, "grad_norm": 20.2238712310791, "learning_rate": 1.0620888502911612e-05, "loss": 2.3445, "step": 3764500 }, { "epoch": 7.881317613959875, "grad_norm": 17.71928596496582, "learning_rate": 1.0615644246421321e-05, "loss": 2.3382, "step": 3765000 }, { "epoch": 7.8823642696855005, "grad_norm": 22.872074127197266, "learning_rate": 1.0610399989931027e-05, "loss": 2.3545, "step": 3765500 }, { "epoch": 7.883410925411127, "grad_norm": 18.471595764160156, "learning_rate": 1.0605155733440737e-05, "loss": 2.3466, "step": 3766000 }, { "epoch": 7.884457581136752, "grad_norm": 17.726245880126953, "learning_rate": 1.0599911476950444e-05, "loss": 2.362, "step": 3766500 }, { "epoch": 7.885504236862378, "grad_norm": 20.675003051757812, "learning_rate": 1.0594667220460152e-05, "loss": 2.3518, "step": 3767000 }, { "epoch": 7.886550892588003, "grad_norm": 17.584684371948242, "learning_rate": 1.0589422963969861e-05, "loss": 2.359, "step": 3767500 }, { "epoch": 7.887597548313629, "grad_norm": 18.64989471435547, "learning_rate": 1.0584178707479569e-05, "loss": 2.3376, "step": 3768000 }, { "epoch": 7.888644204039254, "grad_norm": 20.329946517944336, "learning_rate": 1.0578934450989277e-05, "loss": 2.3574, "step": 3768500 }, { "epoch": 7.88969085976488, "grad_norm": 18.40101432800293, "learning_rate": 1.0573690194498984e-05, "loss": 2.3605, "step": 3769000 }, { "epoch": 7.890737515490505, "grad_norm": 20.422779083251953, "learning_rate": 1.0568445938008694e-05, "loss": 2.3524, "step": 3769500 }, { "epoch": 7.8917841712161305, "grad_norm": 15.425058364868164, "learning_rate": 1.0563201681518402e-05, "loss": 2.3457, "step": 3770000 }, { "epoch": 7.892830826941756, "grad_norm": 17.48125457763672, "learning_rate": 1.0557957425028111e-05, "loss": 2.3344, "step": 3770500 }, { "epoch": 7.893877482667381, "grad_norm": 16.15813446044922, "learning_rate": 1.0552713168537817e-05, "loss": 2.3367, "step": 3771000 }, { "epoch": 7.894924138393007, "grad_norm": 16.234600067138672, "learning_rate": 1.0547468912047526e-05, "loss": 2.3476, "step": 3771500 }, { "epoch": 7.895970794118632, "grad_norm": 18.19049072265625, "learning_rate": 1.0542224655557234e-05, "loss": 2.3576, "step": 3772000 }, { "epoch": 7.897017449844258, "grad_norm": 20.059751510620117, "learning_rate": 1.0536980399066942e-05, "loss": 2.3522, "step": 3772500 }, { "epoch": 7.898064105569883, "grad_norm": 17.449893951416016, "learning_rate": 1.0531736142576651e-05, "loss": 2.3582, "step": 3773000 }, { "epoch": 7.899110761295509, "grad_norm": 16.967464447021484, "learning_rate": 1.0526491886086359e-05, "loss": 2.3578, "step": 3773500 }, { "epoch": 7.900157417021134, "grad_norm": 19.001548767089844, "learning_rate": 1.0521247629596066e-05, "loss": 2.361, "step": 3774000 }, { "epoch": 7.90120407274676, "grad_norm": 15.654335975646973, "learning_rate": 1.0516003373105774e-05, "loss": 2.345, "step": 3774500 }, { "epoch": 7.902250728472385, "grad_norm": 23.0657901763916, "learning_rate": 1.0510759116615484e-05, "loss": 2.3429, "step": 3775000 }, { "epoch": 7.9032973841980105, "grad_norm": 21.139070510864258, "learning_rate": 1.0505514860125191e-05, "loss": 2.3473, "step": 3775500 }, { "epoch": 7.904344039923636, "grad_norm": 19.121868133544922, "learning_rate": 1.05002706036349e-05, "loss": 2.3573, "step": 3776000 }, { "epoch": 7.9053906956492614, "grad_norm": 21.7340145111084, "learning_rate": 1.0495026347144608e-05, "loss": 2.3612, "step": 3776500 }, { "epoch": 7.906437351374887, "grad_norm": 15.373214721679688, "learning_rate": 1.0489782090654316e-05, "loss": 2.3576, "step": 3777000 }, { "epoch": 7.907484007100512, "grad_norm": 17.18181610107422, "learning_rate": 1.0484537834164024e-05, "loss": 2.3564, "step": 3777500 }, { "epoch": 7.908530662826138, "grad_norm": 18.24262046813965, "learning_rate": 1.0479293577673733e-05, "loss": 2.3498, "step": 3778000 }, { "epoch": 7.909577318551763, "grad_norm": 17.857210159301758, "learning_rate": 1.047404932118344e-05, "loss": 2.332, "step": 3778500 }, { "epoch": 7.910623974277389, "grad_norm": 17.253908157348633, "learning_rate": 1.0468805064693148e-05, "loss": 2.3486, "step": 3779000 }, { "epoch": 7.911670630003014, "grad_norm": 16.129526138305664, "learning_rate": 1.0463560808202856e-05, "loss": 2.3353, "step": 3779500 }, { "epoch": 7.91271728572864, "grad_norm": 17.902070999145508, "learning_rate": 1.0458316551712564e-05, "loss": 2.3391, "step": 3780000 }, { "epoch": 7.913763941454265, "grad_norm": 17.60724639892578, "learning_rate": 1.0453072295222273e-05, "loss": 2.3417, "step": 3780500 }, { "epoch": 7.9148105971798906, "grad_norm": 14.827420234680176, "learning_rate": 1.0447828038731981e-05, "loss": 2.3491, "step": 3781000 }, { "epoch": 7.915857252905516, "grad_norm": 15.016580581665039, "learning_rate": 1.044258378224169e-05, "loss": 2.3454, "step": 3781500 }, { "epoch": 7.9169039086311415, "grad_norm": 18.679332733154297, "learning_rate": 1.0437339525751398e-05, "loss": 2.3785, "step": 3782000 }, { "epoch": 7.917950564356767, "grad_norm": 17.389144897460938, "learning_rate": 1.0432095269261106e-05, "loss": 2.3523, "step": 3782500 }, { "epoch": 7.918997220082392, "grad_norm": 18.076507568359375, "learning_rate": 1.0426851012770813e-05, "loss": 2.3563, "step": 3783000 }, { "epoch": 7.920043875808018, "grad_norm": 17.50638198852539, "learning_rate": 1.0421606756280523e-05, "loss": 2.3416, "step": 3783500 }, { "epoch": 7.921090531533643, "grad_norm": 21.576684951782227, "learning_rate": 1.041636249979023e-05, "loss": 2.3835, "step": 3784000 }, { "epoch": 7.922137187259269, "grad_norm": 15.856413841247559, "learning_rate": 1.0411118243299938e-05, "loss": 2.3321, "step": 3784500 }, { "epoch": 7.923183842984894, "grad_norm": 16.21990203857422, "learning_rate": 1.0405873986809648e-05, "loss": 2.3407, "step": 3785000 }, { "epoch": 7.92423049871052, "grad_norm": 22.009544372558594, "learning_rate": 1.0400629730319354e-05, "loss": 2.336, "step": 3785500 }, { "epoch": 7.925277154436146, "grad_norm": 23.083961486816406, "learning_rate": 1.0395385473829063e-05, "loss": 2.344, "step": 3786000 }, { "epoch": 7.9263238101617715, "grad_norm": 19.56478500366211, "learning_rate": 1.039014121733877e-05, "loss": 2.3532, "step": 3786500 }, { "epoch": 7.927370465887397, "grad_norm": 17.444150924682617, "learning_rate": 1.038489696084848e-05, "loss": 2.3508, "step": 3787000 }, { "epoch": 7.928417121613022, "grad_norm": 18.94156837463379, "learning_rate": 1.0379652704358188e-05, "loss": 2.3278, "step": 3787500 }, { "epoch": 7.929463777338648, "grad_norm": 15.978821754455566, "learning_rate": 1.0374408447867895e-05, "loss": 2.3577, "step": 3788000 }, { "epoch": 7.930510433064273, "grad_norm": 17.075090408325195, "learning_rate": 1.0369164191377603e-05, "loss": 2.3431, "step": 3788500 }, { "epoch": 7.931557088789899, "grad_norm": 26.78620719909668, "learning_rate": 1.0363919934887312e-05, "loss": 2.3438, "step": 3789000 }, { "epoch": 7.932603744515524, "grad_norm": 17.14481544494629, "learning_rate": 1.035867567839702e-05, "loss": 2.3322, "step": 3789500 }, { "epoch": 7.93365040024115, "grad_norm": 16.432865142822266, "learning_rate": 1.0353431421906728e-05, "loss": 2.3182, "step": 3790000 }, { "epoch": 7.934697055966775, "grad_norm": 19.557106018066406, "learning_rate": 1.0348187165416437e-05, "loss": 2.3437, "step": 3790500 }, { "epoch": 7.935743711692401, "grad_norm": 19.891090393066406, "learning_rate": 1.0342942908926143e-05, "loss": 2.3471, "step": 3791000 }, { "epoch": 7.936790367418026, "grad_norm": 19.5037899017334, "learning_rate": 1.0337698652435853e-05, "loss": 2.3483, "step": 3791500 }, { "epoch": 7.9378370231436515, "grad_norm": 17.539701461791992, "learning_rate": 1.033245439594556e-05, "loss": 2.3508, "step": 3792000 }, { "epoch": 7.938883678869277, "grad_norm": 19.957714080810547, "learning_rate": 1.032721013945527e-05, "loss": 2.3295, "step": 3792500 }, { "epoch": 7.939930334594902, "grad_norm": 15.545758247375488, "learning_rate": 1.0321965882964977e-05, "loss": 2.3503, "step": 3793000 }, { "epoch": 7.940976990320528, "grad_norm": 17.701948165893555, "learning_rate": 1.0316721626474687e-05, "loss": 2.356, "step": 3793500 }, { "epoch": 7.942023646046153, "grad_norm": 18.25088882446289, "learning_rate": 1.0311477369984393e-05, "loss": 2.3449, "step": 3794000 }, { "epoch": 7.943070301771779, "grad_norm": 18.757856369018555, "learning_rate": 1.0306233113494102e-05, "loss": 2.3652, "step": 3794500 }, { "epoch": 7.944116957497404, "grad_norm": 22.858863830566406, "learning_rate": 1.030098885700381e-05, "loss": 2.3551, "step": 3795000 }, { "epoch": 7.94516361322303, "grad_norm": 15.969436645507812, "learning_rate": 1.0295744600513517e-05, "loss": 2.3404, "step": 3795500 }, { "epoch": 7.946210268948655, "grad_norm": 17.415077209472656, "learning_rate": 1.0290500344023227e-05, "loss": 2.3344, "step": 3796000 }, { "epoch": 7.947256924674281, "grad_norm": 18.522724151611328, "learning_rate": 1.0285256087532935e-05, "loss": 2.348, "step": 3796500 }, { "epoch": 7.948303580399906, "grad_norm": 18.25012969970703, "learning_rate": 1.0280011831042642e-05, "loss": 2.3647, "step": 3797000 }, { "epoch": 7.9493502361255315, "grad_norm": 18.441221237182617, "learning_rate": 1.027476757455235e-05, "loss": 2.3482, "step": 3797500 }, { "epoch": 7.950396891851157, "grad_norm": 18.28406524658203, "learning_rate": 1.026952331806206e-05, "loss": 2.3443, "step": 3798000 }, { "epoch": 7.951443547576782, "grad_norm": 18.351478576660156, "learning_rate": 1.0264279061571767e-05, "loss": 2.3473, "step": 3798500 }, { "epoch": 7.952490203302408, "grad_norm": 17.98979377746582, "learning_rate": 1.0259034805081476e-05, "loss": 2.3552, "step": 3799000 }, { "epoch": 7.953536859028033, "grad_norm": 16.8449649810791, "learning_rate": 1.0253790548591182e-05, "loss": 2.3516, "step": 3799500 }, { "epoch": 7.954583514753659, "grad_norm": 19.603410720825195, "learning_rate": 1.0248546292100892e-05, "loss": 2.3365, "step": 3800000 }, { "epoch": 7.955630170479284, "grad_norm": 27.064714431762695, "learning_rate": 1.02433020356106e-05, "loss": 2.3331, "step": 3800500 }, { "epoch": 7.956676826204911, "grad_norm": 20.797176361083984, "learning_rate": 1.0238057779120309e-05, "loss": 2.3394, "step": 3801000 }, { "epoch": 7.957723481930536, "grad_norm": 16.112979888916016, "learning_rate": 1.0232813522630017e-05, "loss": 2.3467, "step": 3801500 }, { "epoch": 7.9587701376561615, "grad_norm": 17.633241653442383, "learning_rate": 1.0227569266139724e-05, "loss": 2.3811, "step": 3802000 }, { "epoch": 7.959816793381787, "grad_norm": 20.80808448791504, "learning_rate": 1.0222325009649432e-05, "loss": 2.3335, "step": 3802500 }, { "epoch": 7.960863449107412, "grad_norm": 24.47682762145996, "learning_rate": 1.021708075315914e-05, "loss": 2.3277, "step": 3803000 }, { "epoch": 7.961910104833038, "grad_norm": 17.43552017211914, "learning_rate": 1.0211836496668849e-05, "loss": 2.3537, "step": 3803500 }, { "epoch": 7.962956760558663, "grad_norm": 19.171390533447266, "learning_rate": 1.0206592240178557e-05, "loss": 2.3556, "step": 3804000 }, { "epoch": 7.964003416284289, "grad_norm": 16.884733200073242, "learning_rate": 1.0201347983688266e-05, "loss": 2.3485, "step": 3804500 }, { "epoch": 7.965050072009914, "grad_norm": 16.62090301513672, "learning_rate": 1.0196103727197972e-05, "loss": 2.3368, "step": 3805000 }, { "epoch": 7.96609672773554, "grad_norm": 18.481224060058594, "learning_rate": 1.0190859470707681e-05, "loss": 2.3497, "step": 3805500 }, { "epoch": 7.967143383461165, "grad_norm": 21.182119369506836, "learning_rate": 1.018561521421739e-05, "loss": 2.3426, "step": 3806000 }, { "epoch": 7.968190039186791, "grad_norm": 20.719274520874023, "learning_rate": 1.0180370957727099e-05, "loss": 2.3518, "step": 3806500 }, { "epoch": 7.969236694912416, "grad_norm": 21.010700225830078, "learning_rate": 1.0175126701236806e-05, "loss": 2.3527, "step": 3807000 }, { "epoch": 7.9702833506380415, "grad_norm": 18.48495864868164, "learning_rate": 1.0169882444746514e-05, "loss": 2.3439, "step": 3807500 }, { "epoch": 7.971330006363667, "grad_norm": 18.293834686279297, "learning_rate": 1.0164638188256222e-05, "loss": 2.3602, "step": 3808000 }, { "epoch": 7.972376662089292, "grad_norm": 19.844921112060547, "learning_rate": 1.015939393176593e-05, "loss": 2.3453, "step": 3808500 }, { "epoch": 7.973423317814918, "grad_norm": 17.771495819091797, "learning_rate": 1.0154149675275639e-05, "loss": 2.3453, "step": 3809000 }, { "epoch": 7.974469973540543, "grad_norm": 20.396577835083008, "learning_rate": 1.0148905418785346e-05, "loss": 2.3571, "step": 3809500 }, { "epoch": 7.975516629266169, "grad_norm": 19.06376075744629, "learning_rate": 1.0143661162295056e-05, "loss": 2.3492, "step": 3810000 }, { "epoch": 7.976563284991794, "grad_norm": 17.03286361694336, "learning_rate": 1.0138416905804763e-05, "loss": 2.3462, "step": 3810500 }, { "epoch": 7.97760994071742, "grad_norm": 19.43929100036621, "learning_rate": 1.0133172649314471e-05, "loss": 2.345, "step": 3811000 }, { "epoch": 7.978656596443045, "grad_norm": 22.10179328918457, "learning_rate": 1.0127928392824179e-05, "loss": 2.3544, "step": 3811500 }, { "epoch": 7.979703252168671, "grad_norm": 14.973992347717285, "learning_rate": 1.0122684136333888e-05, "loss": 2.3348, "step": 3812000 }, { "epoch": 7.980749907894296, "grad_norm": 20.79482650756836, "learning_rate": 1.0117439879843596e-05, "loss": 2.3361, "step": 3812500 }, { "epoch": 7.9817965636199215, "grad_norm": 17.692716598510742, "learning_rate": 1.0112195623353304e-05, "loss": 2.3454, "step": 3813000 }, { "epoch": 7.982843219345547, "grad_norm": 19.431272506713867, "learning_rate": 1.0106951366863013e-05, "loss": 2.3271, "step": 3813500 }, { "epoch": 7.983889875071172, "grad_norm": 18.547218322753906, "learning_rate": 1.0101707110372719e-05, "loss": 2.3294, "step": 3814000 }, { "epoch": 7.984936530796798, "grad_norm": 20.164588928222656, "learning_rate": 1.0096462853882428e-05, "loss": 2.3409, "step": 3814500 }, { "epoch": 7.985983186522423, "grad_norm": 17.75701141357422, "learning_rate": 1.0091218597392136e-05, "loss": 2.332, "step": 3815000 }, { "epoch": 7.987029842248049, "grad_norm": 16.928237915039062, "learning_rate": 1.0085974340901845e-05, "loss": 2.3581, "step": 3815500 }, { "epoch": 7.988076497973674, "grad_norm": 21.871837615966797, "learning_rate": 1.0080730084411553e-05, "loss": 2.3525, "step": 3816000 }, { "epoch": 7.9891231536993, "grad_norm": 18.924917221069336, "learning_rate": 1.0075485827921261e-05, "loss": 2.3458, "step": 3816500 }, { "epoch": 7.990169809424925, "grad_norm": 19.858678817749023, "learning_rate": 1.0070241571430969e-05, "loss": 2.3436, "step": 3817000 }, { "epoch": 7.991216465150551, "grad_norm": 17.28369903564453, "learning_rate": 1.0064997314940678e-05, "loss": 2.3602, "step": 3817500 }, { "epoch": 7.992263120876176, "grad_norm": 15.985541343688965, "learning_rate": 1.0059753058450386e-05, "loss": 2.3459, "step": 3818000 }, { "epoch": 7.9933097766018015, "grad_norm": 16.70755958557129, "learning_rate": 1.0054508801960093e-05, "loss": 2.3273, "step": 3818500 }, { "epoch": 7.994356432327427, "grad_norm": 18.06015396118164, "learning_rate": 1.0049264545469803e-05, "loss": 2.3494, "step": 3819000 }, { "epoch": 7.995403088053052, "grad_norm": 24.791290283203125, "learning_rate": 1.004402028897951e-05, "loss": 2.3468, "step": 3819500 }, { "epoch": 7.996449743778678, "grad_norm": 20.16731834411621, "learning_rate": 1.0038776032489218e-05, "loss": 2.3435, "step": 3820000 }, { "epoch": 7.997496399504303, "grad_norm": 17.15113067626953, "learning_rate": 1.0033531775998926e-05, "loss": 2.3473, "step": 3820500 }, { "epoch": 7.99854305522993, "grad_norm": 26.38252067565918, "learning_rate": 1.0028287519508635e-05, "loss": 2.338, "step": 3821000 }, { "epoch": 7.999589710955555, "grad_norm": 17.902620315551758, "learning_rate": 1.0023043263018343e-05, "loss": 2.3497, "step": 3821500 }, { "epoch": 8.00063636668118, "grad_norm": 15.861163139343262, "learning_rate": 1.0017799006528052e-05, "loss": 2.3479, "step": 3822000 }, { "epoch": 8.001683022406805, "grad_norm": 19.34862518310547, "learning_rate": 1.0012554750037758e-05, "loss": 2.3543, "step": 3822500 }, { "epoch": 8.00272967813243, "grad_norm": 17.832996368408203, "learning_rate": 1.0007310493547468e-05, "loss": 2.3486, "step": 3823000 }, { "epoch": 8.003776333858056, "grad_norm": 19.269691467285156, "learning_rate": 1.0002066237057175e-05, "loss": 2.3459, "step": 3823500 }, { "epoch": 8.004822989583682, "grad_norm": 15.718094825744629, "learning_rate": 9.996821980566885e-06, "loss": 2.332, "step": 3824000 }, { "epoch": 8.005869645309307, "grad_norm": 19.05845069885254, "learning_rate": 9.991577724076592e-06, "loss": 2.3498, "step": 3824500 }, { "epoch": 8.006916301034932, "grad_norm": 18.16713523864746, "learning_rate": 9.9863334675863e-06, "loss": 2.3403, "step": 3825000 }, { "epoch": 8.007962956760558, "grad_norm": 18.39226531982422, "learning_rate": 9.981089211096008e-06, "loss": 2.3287, "step": 3825500 }, { "epoch": 8.009009612486183, "grad_norm": 23.43306541442871, "learning_rate": 9.975844954605715e-06, "loss": 2.3453, "step": 3826000 }, { "epoch": 8.010056268211809, "grad_norm": 16.872314453125, "learning_rate": 9.970600698115425e-06, "loss": 2.359, "step": 3826500 }, { "epoch": 8.011102923937434, "grad_norm": 18.998647689819336, "learning_rate": 9.965356441625133e-06, "loss": 2.3465, "step": 3827000 }, { "epoch": 8.01214957966306, "grad_norm": 17.584333419799805, "learning_rate": 9.960112185134842e-06, "loss": 2.3299, "step": 3827500 }, { "epoch": 8.013196235388685, "grad_norm": 20.36935043334961, "learning_rate": 9.954867928644548e-06, "loss": 2.352, "step": 3828000 }, { "epoch": 8.014242891114312, "grad_norm": 18.317068099975586, "learning_rate": 9.949623672154257e-06, "loss": 2.328, "step": 3828500 }, { "epoch": 8.015289546839938, "grad_norm": 16.938081741333008, "learning_rate": 9.944379415663965e-06, "loss": 2.3426, "step": 3829000 }, { "epoch": 8.016336202565563, "grad_norm": 17.357027053833008, "learning_rate": 9.939135159173674e-06, "loss": 2.3504, "step": 3829500 }, { "epoch": 8.017382858291189, "grad_norm": 17.94068717956543, "learning_rate": 9.933890902683382e-06, "loss": 2.3617, "step": 3830000 }, { "epoch": 8.018429514016814, "grad_norm": 16.89752960205078, "learning_rate": 9.92864664619309e-06, "loss": 2.3438, "step": 3830500 }, { "epoch": 8.01947616974244, "grad_norm": 21.24911880493164, "learning_rate": 9.923402389702797e-06, "loss": 2.3298, "step": 3831000 }, { "epoch": 8.020522825468065, "grad_norm": 20.02886962890625, "learning_rate": 9.918158133212505e-06, "loss": 2.3414, "step": 3831500 }, { "epoch": 8.02156948119369, "grad_norm": 18.35821533203125, "learning_rate": 9.912913876722215e-06, "loss": 2.3415, "step": 3832000 }, { "epoch": 8.022616136919316, "grad_norm": 21.203508377075195, "learning_rate": 9.907669620231922e-06, "loss": 2.3296, "step": 3832500 }, { "epoch": 8.023662792644942, "grad_norm": 17.008419036865234, "learning_rate": 9.902425363741632e-06, "loss": 2.3278, "step": 3833000 }, { "epoch": 8.024709448370567, "grad_norm": 19.972835540771484, "learning_rate": 9.897181107251338e-06, "loss": 2.3478, "step": 3833500 }, { "epoch": 8.025756104096192, "grad_norm": 23.830196380615234, "learning_rate": 9.891936850761047e-06, "loss": 2.3279, "step": 3834000 }, { "epoch": 8.026802759821818, "grad_norm": 19.239776611328125, "learning_rate": 9.886692594270755e-06, "loss": 2.353, "step": 3834500 }, { "epoch": 8.027849415547443, "grad_norm": 22.078638076782227, "learning_rate": 9.881448337780464e-06, "loss": 2.3292, "step": 3835000 }, { "epoch": 8.028896071273069, "grad_norm": 19.614458084106445, "learning_rate": 9.876204081290172e-06, "loss": 2.3304, "step": 3835500 }, { "epoch": 8.029942726998694, "grad_norm": 17.79443359375, "learning_rate": 9.87095982479988e-06, "loss": 2.3314, "step": 3836000 }, { "epoch": 8.03098938272432, "grad_norm": 22.30477523803711, "learning_rate": 9.865715568309587e-06, "loss": 2.329, "step": 3836500 }, { "epoch": 8.032036038449945, "grad_norm": 23.07135009765625, "learning_rate": 9.860471311819297e-06, "loss": 2.3289, "step": 3837000 }, { "epoch": 8.03308269417557, "grad_norm": 20.80362892150879, "learning_rate": 9.855227055329004e-06, "loss": 2.3465, "step": 3837500 }, { "epoch": 8.034129349901196, "grad_norm": 18.922677993774414, "learning_rate": 9.849982798838712e-06, "loss": 2.3685, "step": 3838000 }, { "epoch": 8.035176005626822, "grad_norm": 18.504638671875, "learning_rate": 9.844738542348421e-06, "loss": 2.3222, "step": 3838500 }, { "epoch": 8.036222661352447, "grad_norm": 32.25514602661133, "learning_rate": 9.839494285858129e-06, "loss": 2.3567, "step": 3839000 }, { "epoch": 8.037269317078072, "grad_norm": 19.925798416137695, "learning_rate": 9.834250029367837e-06, "loss": 2.3342, "step": 3839500 }, { "epoch": 8.038315972803698, "grad_norm": 20.090890884399414, "learning_rate": 9.829005772877544e-06, "loss": 2.3368, "step": 3840000 }, { "epoch": 8.039362628529323, "grad_norm": 23.239885330200195, "learning_rate": 9.823761516387254e-06, "loss": 2.3557, "step": 3840500 }, { "epoch": 8.040409284254949, "grad_norm": 18.731599807739258, "learning_rate": 9.818517259896961e-06, "loss": 2.3302, "step": 3841000 }, { "epoch": 8.041455939980574, "grad_norm": 20.209501266479492, "learning_rate": 9.81327300340667e-06, "loss": 2.3332, "step": 3841500 }, { "epoch": 8.0425025957062, "grad_norm": 16.05323028564453, "learning_rate": 9.808028746916377e-06, "loss": 2.3295, "step": 3842000 }, { "epoch": 8.043549251431825, "grad_norm": 19.425989151000977, "learning_rate": 9.802784490426086e-06, "loss": 2.3307, "step": 3842500 }, { "epoch": 8.04459590715745, "grad_norm": 18.894775390625, "learning_rate": 9.797540233935794e-06, "loss": 2.3473, "step": 3843000 }, { "epoch": 8.045642562883076, "grad_norm": 16.353111267089844, "learning_rate": 9.792295977445502e-06, "loss": 2.2999, "step": 3843500 }, { "epoch": 8.046689218608702, "grad_norm": 23.48505210876465, "learning_rate": 9.787051720955211e-06, "loss": 2.341, "step": 3844000 }, { "epoch": 8.047735874334327, "grad_norm": 18.24711036682129, "learning_rate": 9.781807464464919e-06, "loss": 2.3466, "step": 3844500 }, { "epoch": 8.048782530059952, "grad_norm": 19.259384155273438, "learning_rate": 9.776563207974626e-06, "loss": 2.3553, "step": 3845000 }, { "epoch": 8.049829185785578, "grad_norm": 16.4998722076416, "learning_rate": 9.771318951484334e-06, "loss": 2.3589, "step": 3845500 }, { "epoch": 8.050875841511203, "grad_norm": 16.308683395385742, "learning_rate": 9.766074694994043e-06, "loss": 2.3497, "step": 3846000 }, { "epoch": 8.051922497236829, "grad_norm": 17.935245513916016, "learning_rate": 9.760830438503751e-06, "loss": 2.3396, "step": 3846500 }, { "epoch": 8.052969152962454, "grad_norm": 17.623584747314453, "learning_rate": 9.75558618201346e-06, "loss": 2.3426, "step": 3847000 }, { "epoch": 8.05401580868808, "grad_norm": 14.793787956237793, "learning_rate": 9.750341925523168e-06, "loss": 2.3353, "step": 3847500 }, { "epoch": 8.055062464413705, "grad_norm": 16.985260009765625, "learning_rate": 9.745097669032876e-06, "loss": 2.3514, "step": 3848000 }, { "epoch": 8.05610912013933, "grad_norm": 21.698328018188477, "learning_rate": 9.739853412542584e-06, "loss": 2.35, "step": 3848500 }, { "epoch": 8.057155775864956, "grad_norm": 21.083528518676758, "learning_rate": 9.734609156052291e-06, "loss": 2.3345, "step": 3849000 }, { "epoch": 8.058202431590582, "grad_norm": 17.726482391357422, "learning_rate": 9.729364899562e-06, "loss": 2.3454, "step": 3849500 }, { "epoch": 8.059249087316207, "grad_norm": 20.990161895751953, "learning_rate": 9.724120643071708e-06, "loss": 2.3414, "step": 3850000 }, { "epoch": 8.060295743041832, "grad_norm": 17.29098892211914, "learning_rate": 9.718876386581416e-06, "loss": 2.3462, "step": 3850500 }, { "epoch": 8.061342398767458, "grad_norm": 20.02425193786621, "learning_rate": 9.713632130091124e-06, "loss": 2.3258, "step": 3851000 }, { "epoch": 8.062389054493083, "grad_norm": 18.071786880493164, "learning_rate": 9.708387873600833e-06, "loss": 2.3499, "step": 3851500 }, { "epoch": 8.063435710218709, "grad_norm": 22.374813079833984, "learning_rate": 9.70314361711054e-06, "loss": 2.3418, "step": 3852000 }, { "epoch": 8.064482365944334, "grad_norm": 20.376102447509766, "learning_rate": 9.69789936062025e-06, "loss": 2.3481, "step": 3852500 }, { "epoch": 8.06552902166996, "grad_norm": 17.843795776367188, "learning_rate": 9.692655104129958e-06, "loss": 2.3469, "step": 3853000 }, { "epoch": 8.066575677395585, "grad_norm": 18.753673553466797, "learning_rate": 9.687410847639666e-06, "loss": 2.34, "step": 3853500 }, { "epoch": 8.06762233312121, "grad_norm": 19.395612716674805, "learning_rate": 9.682166591149373e-06, "loss": 2.3516, "step": 3854000 }, { "epoch": 8.068668988846836, "grad_norm": 15.387019157409668, "learning_rate": 9.676922334659081e-06, "loss": 2.3399, "step": 3854500 }, { "epoch": 8.069715644572462, "grad_norm": 17.801454544067383, "learning_rate": 9.67167807816879e-06, "loss": 2.3408, "step": 3855000 }, { "epoch": 8.070762300298087, "grad_norm": 17.322227478027344, "learning_rate": 9.666433821678498e-06, "loss": 2.3508, "step": 3855500 }, { "epoch": 8.071808956023713, "grad_norm": 16.036392211914062, "learning_rate": 9.661189565188207e-06, "loss": 2.3394, "step": 3856000 }, { "epoch": 8.072855611749338, "grad_norm": 21.774227142333984, "learning_rate": 9.655945308697913e-06, "loss": 2.3132, "step": 3856500 }, { "epoch": 8.073902267474963, "grad_norm": 19.856264114379883, "learning_rate": 9.650701052207623e-06, "loss": 2.3316, "step": 3857000 }, { "epoch": 8.074948923200589, "grad_norm": 18.131845474243164, "learning_rate": 9.64545679571733e-06, "loss": 2.3364, "step": 3857500 }, { "epoch": 8.075995578926214, "grad_norm": 20.310487747192383, "learning_rate": 9.64021253922704e-06, "loss": 2.3498, "step": 3858000 }, { "epoch": 8.07704223465184, "grad_norm": 17.783252716064453, "learning_rate": 9.634968282736748e-06, "loss": 2.331, "step": 3858500 }, { "epoch": 8.078088890377465, "grad_norm": 17.99469757080078, "learning_rate": 9.629724026246455e-06, "loss": 2.3174, "step": 3859000 }, { "epoch": 8.07913554610309, "grad_norm": 18.308740615844727, "learning_rate": 9.624479769756163e-06, "loss": 2.3492, "step": 3859500 }, { "epoch": 8.080182201828716, "grad_norm": 20.734289169311523, "learning_rate": 9.619235513265872e-06, "loss": 2.3222, "step": 3860000 }, { "epoch": 8.081228857554342, "grad_norm": 19.335346221923828, "learning_rate": 9.61399125677558e-06, "loss": 2.3208, "step": 3860500 }, { "epoch": 8.082275513279967, "grad_norm": 21.025699615478516, "learning_rate": 9.608747000285288e-06, "loss": 2.3376, "step": 3861000 }, { "epoch": 8.083322169005593, "grad_norm": 18.456586837768555, "learning_rate": 9.603502743794997e-06, "loss": 2.3389, "step": 3861500 }, { "epoch": 8.084368824731218, "grad_norm": 18.289098739624023, "learning_rate": 9.598258487304703e-06, "loss": 2.3258, "step": 3862000 }, { "epoch": 8.085415480456843, "grad_norm": 17.0471248626709, "learning_rate": 9.593014230814412e-06, "loss": 2.3447, "step": 3862500 }, { "epoch": 8.086462136182469, "grad_norm": 20.81525421142578, "learning_rate": 9.58776997432412e-06, "loss": 2.3213, "step": 3863000 }, { "epoch": 8.087508791908096, "grad_norm": 15.387685775756836, "learning_rate": 9.58252571783383e-06, "loss": 2.3312, "step": 3863500 }, { "epoch": 8.088555447633722, "grad_norm": 15.029489517211914, "learning_rate": 9.577281461343537e-06, "loss": 2.3339, "step": 3864000 }, { "epoch": 8.089602103359347, "grad_norm": 17.281450271606445, "learning_rate": 9.572037204853247e-06, "loss": 2.3432, "step": 3864500 }, { "epoch": 8.090648759084972, "grad_norm": 21.500324249267578, "learning_rate": 9.566792948362953e-06, "loss": 2.331, "step": 3865000 }, { "epoch": 8.091695414810598, "grad_norm": 15.90667724609375, "learning_rate": 9.561548691872662e-06, "loss": 2.3359, "step": 3865500 }, { "epoch": 8.092742070536223, "grad_norm": 18.388917922973633, "learning_rate": 9.55630443538237e-06, "loss": 2.3414, "step": 3866000 }, { "epoch": 8.093788726261849, "grad_norm": 20.079004287719727, "learning_rate": 9.551060178892077e-06, "loss": 2.3246, "step": 3866500 }, { "epoch": 8.094835381987474, "grad_norm": 17.99538230895996, "learning_rate": 9.545815922401787e-06, "loss": 2.333, "step": 3867000 }, { "epoch": 8.0958820377131, "grad_norm": 17.60931968688965, "learning_rate": 9.540571665911493e-06, "loss": 2.3291, "step": 3867500 }, { "epoch": 8.096928693438725, "grad_norm": 19.268030166625977, "learning_rate": 9.535327409421202e-06, "loss": 2.3332, "step": 3868000 }, { "epoch": 8.09797534916435, "grad_norm": 22.93962860107422, "learning_rate": 9.53008315293091e-06, "loss": 2.3628, "step": 3868500 }, { "epoch": 8.099022004889976, "grad_norm": 17.036911010742188, "learning_rate": 9.52483889644062e-06, "loss": 2.3316, "step": 3869000 }, { "epoch": 8.100068660615602, "grad_norm": 23.24159049987793, "learning_rate": 9.519594639950327e-06, "loss": 2.3442, "step": 3869500 }, { "epoch": 8.101115316341227, "grad_norm": 19.81153106689453, "learning_rate": 9.514350383460036e-06, "loss": 2.3515, "step": 3870000 }, { "epoch": 8.102161972066853, "grad_norm": 18.480939865112305, "learning_rate": 9.509106126969742e-06, "loss": 2.3187, "step": 3870500 }, { "epoch": 8.103208627792478, "grad_norm": 24.250717163085938, "learning_rate": 9.503861870479452e-06, "loss": 2.3267, "step": 3871000 }, { "epoch": 8.104255283518103, "grad_norm": 19.304277420043945, "learning_rate": 9.49861761398916e-06, "loss": 2.34, "step": 3871500 }, { "epoch": 8.105301939243729, "grad_norm": 18.37351417541504, "learning_rate": 9.493373357498867e-06, "loss": 2.3139, "step": 3872000 }, { "epoch": 8.106348594969354, "grad_norm": 19.834930419921875, "learning_rate": 9.488129101008576e-06, "loss": 2.3375, "step": 3872500 }, { "epoch": 8.10739525069498, "grad_norm": 17.23051643371582, "learning_rate": 9.482884844518284e-06, "loss": 2.3448, "step": 3873000 }, { "epoch": 8.108441906420605, "grad_norm": 18.360973358154297, "learning_rate": 9.477640588027992e-06, "loss": 2.3399, "step": 3873500 }, { "epoch": 8.10948856214623, "grad_norm": 20.143644332885742, "learning_rate": 9.4723963315377e-06, "loss": 2.3354, "step": 3874000 }, { "epoch": 8.110535217871856, "grad_norm": 32.281517028808594, "learning_rate": 9.467152075047409e-06, "loss": 2.3175, "step": 3874500 }, { "epoch": 8.111581873597482, "grad_norm": 19.81734848022461, "learning_rate": 9.461907818557117e-06, "loss": 2.3237, "step": 3875000 }, { "epoch": 8.112628529323107, "grad_norm": 18.39508056640625, "learning_rate": 9.456663562066826e-06, "loss": 2.3433, "step": 3875500 }, { "epoch": 8.113675185048733, "grad_norm": 20.17505645751953, "learning_rate": 9.451419305576532e-06, "loss": 2.3302, "step": 3876000 }, { "epoch": 8.114721840774358, "grad_norm": 19.249563217163086, "learning_rate": 9.446175049086241e-06, "loss": 2.3219, "step": 3876500 }, { "epoch": 8.115768496499983, "grad_norm": 25.769031524658203, "learning_rate": 9.440930792595949e-06, "loss": 2.3408, "step": 3877000 }, { "epoch": 8.116815152225609, "grad_norm": 18.10018539428711, "learning_rate": 9.435686536105657e-06, "loss": 2.3325, "step": 3877500 }, { "epoch": 8.117861807951234, "grad_norm": 15.516273498535156, "learning_rate": 9.430442279615366e-06, "loss": 2.3387, "step": 3878000 }, { "epoch": 8.11890846367686, "grad_norm": 16.804655075073242, "learning_rate": 9.425198023125074e-06, "loss": 2.314, "step": 3878500 }, { "epoch": 8.119955119402485, "grad_norm": 19.320646286010742, "learning_rate": 9.419953766634782e-06, "loss": 2.3493, "step": 3879000 }, { "epoch": 8.12100177512811, "grad_norm": 17.085891723632812, "learning_rate": 9.41470951014449e-06, "loss": 2.3413, "step": 3879500 }, { "epoch": 8.122048430853736, "grad_norm": 16.77912712097168, "learning_rate": 9.409465253654199e-06, "loss": 2.3461, "step": 3880000 }, { "epoch": 8.123095086579362, "grad_norm": 22.47195816040039, "learning_rate": 9.404220997163906e-06, "loss": 2.3377, "step": 3880500 }, { "epoch": 8.124141742304987, "grad_norm": 18.95868682861328, "learning_rate": 9.398976740673616e-06, "loss": 2.3317, "step": 3881000 }, { "epoch": 8.125188398030613, "grad_norm": 22.219257354736328, "learning_rate": 9.393732484183323e-06, "loss": 2.3188, "step": 3881500 }, { "epoch": 8.126235053756238, "grad_norm": 18.397064208984375, "learning_rate": 9.388488227693031e-06, "loss": 2.3544, "step": 3882000 }, { "epoch": 8.127281709481863, "grad_norm": 18.275718688964844, "learning_rate": 9.383243971202739e-06, "loss": 2.3216, "step": 3882500 }, { "epoch": 8.128328365207489, "grad_norm": 20.609159469604492, "learning_rate": 9.377999714712448e-06, "loss": 2.3487, "step": 3883000 }, { "epoch": 8.129375020933114, "grad_norm": 18.794607162475586, "learning_rate": 9.372755458222156e-06, "loss": 2.3506, "step": 3883500 }, { "epoch": 8.13042167665874, "grad_norm": 18.526002883911133, "learning_rate": 9.367511201731864e-06, "loss": 2.3286, "step": 3884000 }, { "epoch": 8.131468332384365, "grad_norm": 17.767627716064453, "learning_rate": 9.362266945241571e-06, "loss": 2.3385, "step": 3884500 }, { "epoch": 8.13251498810999, "grad_norm": 17.30839729309082, "learning_rate": 9.357022688751279e-06, "loss": 2.3332, "step": 3885000 }, { "epoch": 8.133561643835616, "grad_norm": 25.498125076293945, "learning_rate": 9.351778432260988e-06, "loss": 2.3385, "step": 3885500 }, { "epoch": 8.134608299561242, "grad_norm": 16.783607482910156, "learning_rate": 9.346534175770696e-06, "loss": 2.3398, "step": 3886000 }, { "epoch": 8.135654955286867, "grad_norm": 16.553617477416992, "learning_rate": 9.341289919280405e-06, "loss": 2.3505, "step": 3886500 }, { "epoch": 8.136701611012493, "grad_norm": 22.591569900512695, "learning_rate": 9.336045662790113e-06, "loss": 2.3568, "step": 3887000 }, { "epoch": 8.137748266738118, "grad_norm": 21.258007049560547, "learning_rate": 9.33080140629982e-06, "loss": 2.3463, "step": 3887500 }, { "epoch": 8.138794922463743, "grad_norm": 23.314788818359375, "learning_rate": 9.325557149809528e-06, "loss": 2.3216, "step": 3888000 }, { "epoch": 8.139841578189369, "grad_norm": 15.65504264831543, "learning_rate": 9.320312893319238e-06, "loss": 2.3252, "step": 3888500 }, { "epoch": 8.140888233914994, "grad_norm": 17.702823638916016, "learning_rate": 9.315068636828946e-06, "loss": 2.3239, "step": 3889000 }, { "epoch": 8.14193488964062, "grad_norm": 20.12299346923828, "learning_rate": 9.309824380338653e-06, "loss": 2.3229, "step": 3889500 }, { "epoch": 8.142981545366245, "grad_norm": 21.42717170715332, "learning_rate": 9.304580123848363e-06, "loss": 2.333, "step": 3890000 }, { "epoch": 8.14402820109187, "grad_norm": 17.016977310180664, "learning_rate": 9.299335867358069e-06, "loss": 2.3286, "step": 3890500 }, { "epoch": 8.145074856817496, "grad_norm": 16.777873992919922, "learning_rate": 9.294091610867778e-06, "loss": 2.3544, "step": 3891000 }, { "epoch": 8.146121512543122, "grad_norm": 20.869522094726562, "learning_rate": 9.288847354377486e-06, "loss": 2.3414, "step": 3891500 }, { "epoch": 8.147168168268747, "grad_norm": 18.326440811157227, "learning_rate": 9.283603097887195e-06, "loss": 2.3121, "step": 3892000 }, { "epoch": 8.148214823994373, "grad_norm": 18.019681930541992, "learning_rate": 9.278358841396903e-06, "loss": 2.3209, "step": 3892500 }, { "epoch": 8.149261479719998, "grad_norm": 16.582353591918945, "learning_rate": 9.27311458490661e-06, "loss": 2.3382, "step": 3893000 }, { "epoch": 8.150308135445623, "grad_norm": 21.837133407592773, "learning_rate": 9.267870328416318e-06, "loss": 2.3346, "step": 3893500 }, { "epoch": 8.151354791171249, "grad_norm": 19.038604736328125, "learning_rate": 9.262626071926028e-06, "loss": 2.3477, "step": 3894000 }, { "epoch": 8.152401446896874, "grad_norm": 20.158187866210938, "learning_rate": 9.257381815435735e-06, "loss": 2.3171, "step": 3894500 }, { "epoch": 8.1534481026225, "grad_norm": 21.331188201904297, "learning_rate": 9.252137558945443e-06, "loss": 2.3134, "step": 3895000 }, { "epoch": 8.154494758348125, "grad_norm": 16.851165771484375, "learning_rate": 9.246893302455152e-06, "loss": 2.3634, "step": 3895500 }, { "epoch": 8.15554141407375, "grad_norm": 20.34442138671875, "learning_rate": 9.24164904596486e-06, "loss": 2.3299, "step": 3896000 }, { "epoch": 8.156588069799376, "grad_norm": 18.808542251586914, "learning_rate": 9.236404789474568e-06, "loss": 2.3248, "step": 3896500 }, { "epoch": 8.157634725525002, "grad_norm": 18.7241153717041, "learning_rate": 9.231160532984275e-06, "loss": 2.3409, "step": 3897000 }, { "epoch": 8.158681381250627, "grad_norm": 15.280763626098633, "learning_rate": 9.225916276493985e-06, "loss": 2.3303, "step": 3897500 }, { "epoch": 8.159728036976254, "grad_norm": 16.80512237548828, "learning_rate": 9.220672020003692e-06, "loss": 2.3251, "step": 3898000 }, { "epoch": 8.16077469270188, "grad_norm": 18.362335205078125, "learning_rate": 9.215427763513402e-06, "loss": 2.3448, "step": 3898500 }, { "epoch": 8.161821348427505, "grad_norm": 17.170875549316406, "learning_rate": 9.210183507023108e-06, "loss": 2.3255, "step": 3899000 }, { "epoch": 8.16286800415313, "grad_norm": 18.563596725463867, "learning_rate": 9.204939250532817e-06, "loss": 2.3601, "step": 3899500 }, { "epoch": 8.163914659878756, "grad_norm": 17.874753952026367, "learning_rate": 9.199694994042525e-06, "loss": 2.3383, "step": 3900000 }, { "epoch": 8.164961315604382, "grad_norm": 17.512989044189453, "learning_rate": 9.194450737552234e-06, "loss": 2.3338, "step": 3900500 }, { "epoch": 8.166007971330007, "grad_norm": 15.814064025878906, "learning_rate": 9.189206481061942e-06, "loss": 2.3432, "step": 3901000 }, { "epoch": 8.167054627055633, "grad_norm": 18.832765579223633, "learning_rate": 9.18396222457165e-06, "loss": 2.3468, "step": 3901500 }, { "epoch": 8.168101282781258, "grad_norm": 18.325685501098633, "learning_rate": 9.178717968081357e-06, "loss": 2.312, "step": 3902000 }, { "epoch": 8.169147938506883, "grad_norm": 17.286916732788086, "learning_rate": 9.173473711591065e-06, "loss": 2.3254, "step": 3902500 }, { "epoch": 8.170194594232509, "grad_norm": 17.91053581237793, "learning_rate": 9.168229455100774e-06, "loss": 2.329, "step": 3903000 }, { "epoch": 8.171241249958134, "grad_norm": 19.372785568237305, "learning_rate": 9.162985198610482e-06, "loss": 2.3513, "step": 3903500 }, { "epoch": 8.17228790568376, "grad_norm": 19.668212890625, "learning_rate": 9.157740942120192e-06, "loss": 2.3304, "step": 3904000 }, { "epoch": 8.173334561409385, "grad_norm": 16.445877075195312, "learning_rate": 9.152496685629898e-06, "loss": 2.3268, "step": 3904500 }, { "epoch": 8.17438121713501, "grad_norm": 22.215208053588867, "learning_rate": 9.147252429139607e-06, "loss": 2.3353, "step": 3905000 }, { "epoch": 8.175427872860636, "grad_norm": 20.245952606201172, "learning_rate": 9.142008172649315e-06, "loss": 2.3272, "step": 3905500 }, { "epoch": 8.176474528586262, "grad_norm": 18.969970703125, "learning_rate": 9.136763916159024e-06, "loss": 2.3236, "step": 3906000 }, { "epoch": 8.177521184311887, "grad_norm": 19.818592071533203, "learning_rate": 9.131519659668732e-06, "loss": 2.3376, "step": 3906500 }, { "epoch": 8.178567840037513, "grad_norm": 17.179441452026367, "learning_rate": 9.12627540317844e-06, "loss": 2.327, "step": 3907000 }, { "epoch": 8.179614495763138, "grad_norm": 28.978090286254883, "learning_rate": 9.121031146688147e-06, "loss": 2.322, "step": 3907500 }, { "epoch": 8.180661151488763, "grad_norm": 22.452848434448242, "learning_rate": 9.115786890197855e-06, "loss": 2.3443, "step": 3908000 }, { "epoch": 8.181707807214389, "grad_norm": 18.786741256713867, "learning_rate": 9.110542633707564e-06, "loss": 2.323, "step": 3908500 }, { "epoch": 8.182754462940014, "grad_norm": 20.533674240112305, "learning_rate": 9.105298377217272e-06, "loss": 2.3217, "step": 3909000 }, { "epoch": 8.18380111866564, "grad_norm": 19.30573272705078, "learning_rate": 9.100054120726981e-06, "loss": 2.3287, "step": 3909500 }, { "epoch": 8.184847774391265, "grad_norm": 17.112560272216797, "learning_rate": 9.094809864236687e-06, "loss": 2.3235, "step": 3910000 }, { "epoch": 8.18589443011689, "grad_norm": 17.961458206176758, "learning_rate": 9.089565607746397e-06, "loss": 2.32, "step": 3910500 }, { "epoch": 8.186941085842516, "grad_norm": 18.658466339111328, "learning_rate": 9.084321351256104e-06, "loss": 2.3431, "step": 3911000 }, { "epoch": 8.187987741568142, "grad_norm": 17.382747650146484, "learning_rate": 9.079077094765814e-06, "loss": 2.345, "step": 3911500 }, { "epoch": 8.189034397293767, "grad_norm": 14.565335273742676, "learning_rate": 9.073832838275521e-06, "loss": 2.3495, "step": 3912000 }, { "epoch": 8.190081053019393, "grad_norm": 17.595542907714844, "learning_rate": 9.068588581785229e-06, "loss": 2.3231, "step": 3912500 }, { "epoch": 8.191127708745018, "grad_norm": 20.707185745239258, "learning_rate": 9.063344325294937e-06, "loss": 2.3209, "step": 3913000 }, { "epoch": 8.192174364470644, "grad_norm": 15.944828987121582, "learning_rate": 9.058100068804644e-06, "loss": 2.3192, "step": 3913500 }, { "epoch": 8.193221020196269, "grad_norm": 20.596420288085938, "learning_rate": 9.052855812314354e-06, "loss": 2.3283, "step": 3914000 }, { "epoch": 8.194267675921894, "grad_norm": 19.238882064819336, "learning_rate": 9.047611555824061e-06, "loss": 2.3218, "step": 3914500 }, { "epoch": 8.19531433164752, "grad_norm": 18.090757369995117, "learning_rate": 9.042367299333771e-06, "loss": 2.324, "step": 3915000 }, { "epoch": 8.196360987373145, "grad_norm": 16.606348037719727, "learning_rate": 9.037123042843479e-06, "loss": 2.3344, "step": 3915500 }, { "epoch": 8.19740764309877, "grad_norm": 22.414316177368164, "learning_rate": 9.031878786353186e-06, "loss": 2.3353, "step": 3916000 }, { "epoch": 8.198454298824396, "grad_norm": 19.55415153503418, "learning_rate": 9.026634529862894e-06, "loss": 2.3161, "step": 3916500 }, { "epoch": 8.199500954550022, "grad_norm": 17.02579116821289, "learning_rate": 9.021390273372603e-06, "loss": 2.3605, "step": 3917000 }, { "epoch": 8.200547610275647, "grad_norm": 18.68869400024414, "learning_rate": 9.016146016882311e-06, "loss": 2.3209, "step": 3917500 }, { "epoch": 8.201594266001273, "grad_norm": 16.41204833984375, "learning_rate": 9.010901760392019e-06, "loss": 2.3262, "step": 3918000 }, { "epoch": 8.202640921726898, "grad_norm": 18.754688262939453, "learning_rate": 9.005657503901726e-06, "loss": 2.3429, "step": 3918500 }, { "epoch": 8.203687577452524, "grad_norm": 20.88689422607422, "learning_rate": 9.000413247411436e-06, "loss": 2.3399, "step": 3919000 }, { "epoch": 8.204734233178149, "grad_norm": 20.824779510498047, "learning_rate": 8.995168990921143e-06, "loss": 2.3502, "step": 3919500 }, { "epoch": 8.205780888903774, "grad_norm": 19.417964935302734, "learning_rate": 8.989924734430851e-06, "loss": 2.3246, "step": 3920000 }, { "epoch": 8.2068275446294, "grad_norm": 19.172958374023438, "learning_rate": 8.98468047794056e-06, "loss": 2.3222, "step": 3920500 }, { "epoch": 8.207874200355025, "grad_norm": 19.721384048461914, "learning_rate": 8.979436221450268e-06, "loss": 2.3176, "step": 3921000 }, { "epoch": 8.20892085608065, "grad_norm": 17.41860008239746, "learning_rate": 8.974191964959976e-06, "loss": 2.3407, "step": 3921500 }, { "epoch": 8.209967511806276, "grad_norm": 19.534446716308594, "learning_rate": 8.968947708469684e-06, "loss": 2.3274, "step": 3922000 }, { "epoch": 8.211014167531902, "grad_norm": 16.340068817138672, "learning_rate": 8.963703451979393e-06, "loss": 2.3438, "step": 3922500 }, { "epoch": 8.212060823257527, "grad_norm": 23.530336380004883, "learning_rate": 8.9584591954891e-06, "loss": 2.3286, "step": 3923000 }, { "epoch": 8.213107478983153, "grad_norm": 18.651914596557617, "learning_rate": 8.95321493899881e-06, "loss": 2.3337, "step": 3923500 }, { "epoch": 8.214154134708778, "grad_norm": 21.579975128173828, "learning_rate": 8.947970682508518e-06, "loss": 2.3427, "step": 3924000 }, { "epoch": 8.215200790434404, "grad_norm": 21.42156982421875, "learning_rate": 8.942726426018225e-06, "loss": 2.2973, "step": 3924500 }, { "epoch": 8.216247446160029, "grad_norm": 17.14889144897461, "learning_rate": 8.937482169527933e-06, "loss": 2.3231, "step": 3925000 }, { "epoch": 8.217294101885654, "grad_norm": 16.902490615844727, "learning_rate": 8.932237913037641e-06, "loss": 2.3252, "step": 3925500 }, { "epoch": 8.21834075761128, "grad_norm": 18.097299575805664, "learning_rate": 8.92699365654735e-06, "loss": 2.3507, "step": 3926000 }, { "epoch": 8.219387413336905, "grad_norm": 17.104639053344727, "learning_rate": 8.921749400057058e-06, "loss": 2.3354, "step": 3926500 }, { "epoch": 8.22043406906253, "grad_norm": 17.531269073486328, "learning_rate": 8.916505143566766e-06, "loss": 2.327, "step": 3927000 }, { "epoch": 8.221480724788156, "grad_norm": 18.868803024291992, "learning_rate": 8.911260887076473e-06, "loss": 2.331, "step": 3927500 }, { "epoch": 8.222527380513782, "grad_norm": 17.878299713134766, "learning_rate": 8.906016630586183e-06, "loss": 2.3309, "step": 3928000 }, { "epoch": 8.223574036239407, "grad_norm": 18.022096633911133, "learning_rate": 8.90077237409589e-06, "loss": 2.3266, "step": 3928500 }, { "epoch": 8.224620691965033, "grad_norm": 19.395336151123047, "learning_rate": 8.8955281176056e-06, "loss": 2.3324, "step": 3929000 }, { "epoch": 8.225667347690658, "grad_norm": 19.021514892578125, "learning_rate": 8.890283861115307e-06, "loss": 2.3318, "step": 3929500 }, { "epoch": 8.226714003416284, "grad_norm": 20.46791648864746, "learning_rate": 8.885039604625015e-06, "loss": 2.3374, "step": 3930000 }, { "epoch": 8.227760659141909, "grad_norm": 16.998830795288086, "learning_rate": 8.879795348134723e-06, "loss": 2.341, "step": 3930500 }, { "epoch": 8.228807314867534, "grad_norm": 22.054767608642578, "learning_rate": 8.87455109164443e-06, "loss": 2.305, "step": 3931000 }, { "epoch": 8.22985397059316, "grad_norm": 15.234109878540039, "learning_rate": 8.86930683515414e-06, "loss": 2.3207, "step": 3931500 }, { "epoch": 8.230900626318785, "grad_norm": 17.498414993286133, "learning_rate": 8.864062578663848e-06, "loss": 2.3292, "step": 3932000 }, { "epoch": 8.23194728204441, "grad_norm": 24.854938507080078, "learning_rate": 8.858818322173557e-06, "loss": 2.3306, "step": 3932500 }, { "epoch": 8.232993937770036, "grad_norm": 18.978199005126953, "learning_rate": 8.853574065683263e-06, "loss": 2.3369, "step": 3933000 }, { "epoch": 8.234040593495664, "grad_norm": 19.789464950561523, "learning_rate": 8.848329809192972e-06, "loss": 2.3163, "step": 3933500 }, { "epoch": 8.235087249221289, "grad_norm": 19.169662475585938, "learning_rate": 8.84308555270268e-06, "loss": 2.3381, "step": 3934000 }, { "epoch": 8.236133904946914, "grad_norm": 18.95401382446289, "learning_rate": 8.83784129621239e-06, "loss": 2.324, "step": 3934500 }, { "epoch": 8.23718056067254, "grad_norm": 22.49205780029297, "learning_rate": 8.832597039722097e-06, "loss": 2.3376, "step": 3935000 }, { "epoch": 8.238227216398165, "grad_norm": 19.792434692382812, "learning_rate": 8.827352783231805e-06, "loss": 2.3267, "step": 3935500 }, { "epoch": 8.23927387212379, "grad_norm": 16.679603576660156, "learning_rate": 8.822108526741513e-06, "loss": 2.3226, "step": 3936000 }, { "epoch": 8.240320527849416, "grad_norm": 18.882123947143555, "learning_rate": 8.81686427025122e-06, "loss": 2.3247, "step": 3936500 }, { "epoch": 8.241367183575042, "grad_norm": 20.220199584960938, "learning_rate": 8.81162001376093e-06, "loss": 2.3229, "step": 3937000 }, { "epoch": 8.242413839300667, "grad_norm": 20.294227600097656, "learning_rate": 8.806375757270637e-06, "loss": 2.3381, "step": 3937500 }, { "epoch": 8.243460495026293, "grad_norm": 21.1709041595459, "learning_rate": 8.801131500780347e-06, "loss": 2.3267, "step": 3938000 }, { "epoch": 8.244507150751918, "grad_norm": 23.409109115600586, "learning_rate": 8.795887244290053e-06, "loss": 2.3441, "step": 3938500 }, { "epoch": 8.245553806477544, "grad_norm": 21.191165924072266, "learning_rate": 8.790642987799762e-06, "loss": 2.3263, "step": 3939000 }, { "epoch": 8.246600462203169, "grad_norm": 20.87224578857422, "learning_rate": 8.78539873130947e-06, "loss": 2.3257, "step": 3939500 }, { "epoch": 8.247647117928794, "grad_norm": 17.878896713256836, "learning_rate": 8.78015447481918e-06, "loss": 2.3317, "step": 3940000 }, { "epoch": 8.24869377365442, "grad_norm": 20.361892700195312, "learning_rate": 8.774910218328887e-06, "loss": 2.3467, "step": 3940500 }, { "epoch": 8.249740429380045, "grad_norm": 20.43202018737793, "learning_rate": 8.769665961838596e-06, "loss": 2.3266, "step": 3941000 }, { "epoch": 8.25078708510567, "grad_norm": 18.9981689453125, "learning_rate": 8.764421705348302e-06, "loss": 2.3437, "step": 3941500 }, { "epoch": 8.251833740831296, "grad_norm": 20.301725387573242, "learning_rate": 8.759177448858012e-06, "loss": 2.3339, "step": 3942000 }, { "epoch": 8.252880396556922, "grad_norm": 19.593149185180664, "learning_rate": 8.75393319236772e-06, "loss": 2.3354, "step": 3942500 }, { "epoch": 8.253927052282547, "grad_norm": 15.391679763793945, "learning_rate": 8.748688935877427e-06, "loss": 2.3564, "step": 3943000 }, { "epoch": 8.254973708008173, "grad_norm": 21.754053115844727, "learning_rate": 8.743444679387136e-06, "loss": 2.3432, "step": 3943500 }, { "epoch": 8.256020363733798, "grad_norm": 22.848878860473633, "learning_rate": 8.738200422896844e-06, "loss": 2.3194, "step": 3944000 }, { "epoch": 8.257067019459424, "grad_norm": 18.753808975219727, "learning_rate": 8.732956166406552e-06, "loss": 2.3331, "step": 3944500 }, { "epoch": 8.258113675185049, "grad_norm": 20.771522521972656, "learning_rate": 8.72771190991626e-06, "loss": 2.3155, "step": 3945000 }, { "epoch": 8.259160330910674, "grad_norm": 19.829387664794922, "learning_rate": 8.722467653425969e-06, "loss": 2.3145, "step": 3945500 }, { "epoch": 8.2602069866363, "grad_norm": 17.345420837402344, "learning_rate": 8.717223396935677e-06, "loss": 2.323, "step": 3946000 }, { "epoch": 8.261253642361925, "grad_norm": 17.876386642456055, "learning_rate": 8.711979140445386e-06, "loss": 2.3262, "step": 3946500 }, { "epoch": 8.26230029808755, "grad_norm": 21.56599998474121, "learning_rate": 8.706734883955092e-06, "loss": 2.3148, "step": 3947000 }, { "epoch": 8.263346953813176, "grad_norm": 14.959748268127441, "learning_rate": 8.701490627464801e-06, "loss": 2.3357, "step": 3947500 }, { "epoch": 8.264393609538802, "grad_norm": 18.783832550048828, "learning_rate": 8.696246370974509e-06, "loss": 2.3084, "step": 3948000 }, { "epoch": 8.265440265264427, "grad_norm": 16.418807983398438, "learning_rate": 8.691002114484217e-06, "loss": 2.3207, "step": 3948500 }, { "epoch": 8.266486920990053, "grad_norm": 21.433584213256836, "learning_rate": 8.685757857993926e-06, "loss": 2.3112, "step": 3949000 }, { "epoch": 8.267533576715678, "grad_norm": 19.7863826751709, "learning_rate": 8.680513601503634e-06, "loss": 2.3184, "step": 3949500 }, { "epoch": 8.268580232441304, "grad_norm": 17.86429786682129, "learning_rate": 8.675269345013341e-06, "loss": 2.3415, "step": 3950000 }, { "epoch": 8.269626888166929, "grad_norm": 17.648637771606445, "learning_rate": 8.670025088523049e-06, "loss": 2.3348, "step": 3950500 }, { "epoch": 8.270673543892554, "grad_norm": 20.348876953125, "learning_rate": 8.664780832032759e-06, "loss": 2.3242, "step": 3951000 }, { "epoch": 8.27172019961818, "grad_norm": 24.376323699951172, "learning_rate": 8.659536575542466e-06, "loss": 2.3233, "step": 3951500 }, { "epoch": 8.272766855343805, "grad_norm": 20.3951473236084, "learning_rate": 8.654292319052176e-06, "loss": 2.3214, "step": 3952000 }, { "epoch": 8.27381351106943, "grad_norm": 16.098491668701172, "learning_rate": 8.649048062561883e-06, "loss": 2.3275, "step": 3952500 }, { "epoch": 8.274860166795056, "grad_norm": 21.588010787963867, "learning_rate": 8.643803806071591e-06, "loss": 2.3222, "step": 3953000 }, { "epoch": 8.275906822520682, "grad_norm": 18.466169357299805, "learning_rate": 8.638559549581299e-06, "loss": 2.3384, "step": 3953500 }, { "epoch": 8.276953478246307, "grad_norm": 20.162050247192383, "learning_rate": 8.633315293091006e-06, "loss": 2.3341, "step": 3954000 }, { "epoch": 8.278000133971933, "grad_norm": 17.535261154174805, "learning_rate": 8.628071036600716e-06, "loss": 2.3335, "step": 3954500 }, { "epoch": 8.279046789697558, "grad_norm": 20.815839767456055, "learning_rate": 8.622826780110423e-06, "loss": 2.3416, "step": 3955000 }, { "epoch": 8.280093445423184, "grad_norm": 17.183889389038086, "learning_rate": 8.617582523620131e-06, "loss": 2.3323, "step": 3955500 }, { "epoch": 8.281140101148809, "grad_norm": 32.85243225097656, "learning_rate": 8.612338267129839e-06, "loss": 2.3274, "step": 3956000 }, { "epoch": 8.282186756874435, "grad_norm": 19.596927642822266, "learning_rate": 8.607094010639548e-06, "loss": 2.3114, "step": 3956500 }, { "epoch": 8.28323341260006, "grad_norm": 18.22878074645996, "learning_rate": 8.601849754149256e-06, "loss": 2.3316, "step": 3957000 }, { "epoch": 8.284280068325685, "grad_norm": 19.92812156677246, "learning_rate": 8.596605497658965e-06, "loss": 2.3326, "step": 3957500 }, { "epoch": 8.28532672405131, "grad_norm": 20.84733009338379, "learning_rate": 8.591361241168673e-06, "loss": 2.3235, "step": 3958000 }, { "epoch": 8.286373379776936, "grad_norm": 15.574783325195312, "learning_rate": 8.58611698467838e-06, "loss": 2.3579, "step": 3958500 }, { "epoch": 8.287420035502562, "grad_norm": 18.43537712097168, "learning_rate": 8.580872728188088e-06, "loss": 2.3315, "step": 3959000 }, { "epoch": 8.288466691228187, "grad_norm": 18.75430679321289, "learning_rate": 8.575628471697798e-06, "loss": 2.3427, "step": 3959500 }, { "epoch": 8.289513346953813, "grad_norm": 17.495616912841797, "learning_rate": 8.570384215207505e-06, "loss": 2.3376, "step": 3960000 }, { "epoch": 8.290560002679438, "grad_norm": 19.18819808959961, "learning_rate": 8.565139958717213e-06, "loss": 2.3372, "step": 3960500 }, { "epoch": 8.291606658405064, "grad_norm": 20.687894821166992, "learning_rate": 8.559895702226923e-06, "loss": 2.3328, "step": 3961000 }, { "epoch": 8.292653314130689, "grad_norm": 22.297456741333008, "learning_rate": 8.554651445736629e-06, "loss": 2.3398, "step": 3961500 }, { "epoch": 8.293699969856315, "grad_norm": 16.2756404876709, "learning_rate": 8.549407189246338e-06, "loss": 2.3308, "step": 3962000 }, { "epoch": 8.29474662558194, "grad_norm": 15.989151954650879, "learning_rate": 8.544162932756046e-06, "loss": 2.3336, "step": 3962500 }, { "epoch": 8.295793281307565, "grad_norm": 17.8392391204834, "learning_rate": 8.538918676265755e-06, "loss": 2.34, "step": 3963000 }, { "epoch": 8.296839937033191, "grad_norm": 20.2867374420166, "learning_rate": 8.533674419775463e-06, "loss": 2.3385, "step": 3963500 }, { "epoch": 8.297886592758816, "grad_norm": 18.12394905090332, "learning_rate": 8.52843016328517e-06, "loss": 2.3316, "step": 3964000 }, { "epoch": 8.298933248484442, "grad_norm": 18.835865020751953, "learning_rate": 8.523185906794878e-06, "loss": 2.3095, "step": 3964500 }, { "epoch": 8.299979904210067, "grad_norm": 20.24064064025879, "learning_rate": 8.517941650304587e-06, "loss": 2.3163, "step": 3965000 }, { "epoch": 8.301026559935693, "grad_norm": 19.72666358947754, "learning_rate": 8.512697393814295e-06, "loss": 2.3106, "step": 3965500 }, { "epoch": 8.302073215661318, "grad_norm": 21.043556213378906, "learning_rate": 8.507453137324003e-06, "loss": 2.3159, "step": 3966000 }, { "epoch": 8.303119871386944, "grad_norm": 21.35428810119629, "learning_rate": 8.502208880833712e-06, "loss": 2.3451, "step": 3966500 }, { "epoch": 8.304166527112569, "grad_norm": 15.104958534240723, "learning_rate": 8.496964624343418e-06, "loss": 2.317, "step": 3967000 }, { "epoch": 8.305213182838195, "grad_norm": 21.926132202148438, "learning_rate": 8.491720367853128e-06, "loss": 2.3399, "step": 3967500 }, { "epoch": 8.306259838563822, "grad_norm": 17.63503646850586, "learning_rate": 8.486476111362835e-06, "loss": 2.317, "step": 3968000 }, { "epoch": 8.307306494289445, "grad_norm": 19.546974182128906, "learning_rate": 8.481231854872545e-06, "loss": 2.31, "step": 3968500 }, { "epoch": 8.308353150015073, "grad_norm": 16.764328002929688, "learning_rate": 8.475987598382252e-06, "loss": 2.3373, "step": 3969000 }, { "epoch": 8.309399805740698, "grad_norm": 22.85115623474121, "learning_rate": 8.470743341891962e-06, "loss": 2.3281, "step": 3969500 }, { "epoch": 8.310446461466324, "grad_norm": 25.9664363861084, "learning_rate": 8.465499085401668e-06, "loss": 2.3469, "step": 3970000 }, { "epoch": 8.311493117191949, "grad_norm": 18.22427749633789, "learning_rate": 8.460254828911377e-06, "loss": 2.3277, "step": 3970500 }, { "epoch": 8.312539772917575, "grad_norm": 18.391380310058594, "learning_rate": 8.455010572421085e-06, "loss": 2.3362, "step": 3971000 }, { "epoch": 8.3135864286432, "grad_norm": 16.5615177154541, "learning_rate": 8.449766315930793e-06, "loss": 2.339, "step": 3971500 }, { "epoch": 8.314633084368825, "grad_norm": 15.2998685836792, "learning_rate": 8.444522059440502e-06, "loss": 2.331, "step": 3972000 }, { "epoch": 8.31567974009445, "grad_norm": 17.942054748535156, "learning_rate": 8.439277802950208e-06, "loss": 2.3105, "step": 3972500 }, { "epoch": 8.316726395820076, "grad_norm": 18.53024673461914, "learning_rate": 8.434033546459917e-06, "loss": 2.3288, "step": 3973000 }, { "epoch": 8.317773051545702, "grad_norm": 17.234617233276367, "learning_rate": 8.428789289969625e-06, "loss": 2.3299, "step": 3973500 }, { "epoch": 8.318819707271327, "grad_norm": 18.35879898071289, "learning_rate": 8.423545033479334e-06, "loss": 2.3347, "step": 3974000 }, { "epoch": 8.319866362996953, "grad_norm": 17.0389404296875, "learning_rate": 8.418300776989042e-06, "loss": 2.3207, "step": 3974500 }, { "epoch": 8.320913018722578, "grad_norm": 17.275249481201172, "learning_rate": 8.413056520498751e-06, "loss": 2.3308, "step": 3975000 }, { "epoch": 8.321959674448204, "grad_norm": 17.783828735351562, "learning_rate": 8.407812264008457e-06, "loss": 2.3206, "step": 3975500 }, { "epoch": 8.323006330173829, "grad_norm": 17.420909881591797, "learning_rate": 8.402568007518167e-06, "loss": 2.3145, "step": 3976000 }, { "epoch": 8.324052985899455, "grad_norm": 19.357412338256836, "learning_rate": 8.397323751027875e-06, "loss": 2.3287, "step": 3976500 }, { "epoch": 8.32509964162508, "grad_norm": 20.09633445739746, "learning_rate": 8.392079494537582e-06, "loss": 2.3418, "step": 3977000 }, { "epoch": 8.326146297350705, "grad_norm": 28.996417999267578, "learning_rate": 8.386835238047292e-06, "loss": 2.3364, "step": 3977500 }, { "epoch": 8.327192953076331, "grad_norm": 19.16389274597168, "learning_rate": 8.381590981557e-06, "loss": 2.3471, "step": 3978000 }, { "epoch": 8.328239608801956, "grad_norm": 17.07559585571289, "learning_rate": 8.376346725066707e-06, "loss": 2.3416, "step": 3978500 }, { "epoch": 8.329286264527582, "grad_norm": 18.532737731933594, "learning_rate": 8.371102468576415e-06, "loss": 2.3223, "step": 3979000 }, { "epoch": 8.330332920253207, "grad_norm": 22.672147750854492, "learning_rate": 8.365858212086124e-06, "loss": 2.3244, "step": 3979500 }, { "epoch": 8.331379575978833, "grad_norm": 20.347843170166016, "learning_rate": 8.360613955595832e-06, "loss": 2.3242, "step": 3980000 }, { "epoch": 8.332426231704458, "grad_norm": 19.006351470947266, "learning_rate": 8.355369699105541e-06, "loss": 2.3168, "step": 3980500 }, { "epoch": 8.333472887430084, "grad_norm": 19.46746063232422, "learning_rate": 8.350125442615247e-06, "loss": 2.328, "step": 3981000 }, { "epoch": 8.334519543155709, "grad_norm": 18.91793441772461, "learning_rate": 8.344881186124956e-06, "loss": 2.3458, "step": 3981500 }, { "epoch": 8.335566198881335, "grad_norm": 18.812335968017578, "learning_rate": 8.339636929634664e-06, "loss": 2.3157, "step": 3982000 }, { "epoch": 8.33661285460696, "grad_norm": 22.445384979248047, "learning_rate": 8.334392673144374e-06, "loss": 2.3189, "step": 3982500 }, { "epoch": 8.337659510332585, "grad_norm": 20.02627182006836, "learning_rate": 8.329148416654081e-06, "loss": 2.324, "step": 3983000 }, { "epoch": 8.338706166058211, "grad_norm": 20.916685104370117, "learning_rate": 8.323904160163789e-06, "loss": 2.3411, "step": 3983500 }, { "epoch": 8.339752821783836, "grad_norm": 20.43821907043457, "learning_rate": 8.318659903673497e-06, "loss": 2.3373, "step": 3984000 }, { "epoch": 8.340799477509462, "grad_norm": 20.041963577270508, "learning_rate": 8.313415647183204e-06, "loss": 2.3425, "step": 3984500 }, { "epoch": 8.341846133235087, "grad_norm": 21.556930541992188, "learning_rate": 8.308171390692914e-06, "loss": 2.3236, "step": 3985000 }, { "epoch": 8.342892788960713, "grad_norm": 16.875675201416016, "learning_rate": 8.302927134202621e-06, "loss": 2.3463, "step": 3985500 }, { "epoch": 8.343939444686338, "grad_norm": 20.699567794799805, "learning_rate": 8.29768287771233e-06, "loss": 2.3283, "step": 3986000 }, { "epoch": 8.344986100411964, "grad_norm": 17.79894256591797, "learning_rate": 8.292438621222038e-06, "loss": 2.3185, "step": 3986500 }, { "epoch": 8.346032756137589, "grad_norm": 15.60158920288086, "learning_rate": 8.287194364731746e-06, "loss": 2.3405, "step": 3987000 }, { "epoch": 8.347079411863215, "grad_norm": 21.741207122802734, "learning_rate": 8.281950108241454e-06, "loss": 2.3241, "step": 3987500 }, { "epoch": 8.34812606758884, "grad_norm": 18.38046646118164, "learning_rate": 8.276705851751163e-06, "loss": 2.3304, "step": 3988000 }, { "epoch": 8.349172723314465, "grad_norm": 19.564512252807617, "learning_rate": 8.271461595260871e-06, "loss": 2.323, "step": 3988500 }, { "epoch": 8.350219379040091, "grad_norm": 23.00644302368164, "learning_rate": 8.266217338770579e-06, "loss": 2.3221, "step": 3989000 }, { "epoch": 8.351266034765716, "grad_norm": 19.55278778076172, "learning_rate": 8.260973082280286e-06, "loss": 2.3388, "step": 3989500 }, { "epoch": 8.352312690491342, "grad_norm": 15.872673988342285, "learning_rate": 8.255728825789994e-06, "loss": 2.3223, "step": 3990000 }, { "epoch": 8.353359346216967, "grad_norm": 20.7786808013916, "learning_rate": 8.250484569299703e-06, "loss": 2.3371, "step": 3990500 }, { "epoch": 8.354406001942593, "grad_norm": 19.747089385986328, "learning_rate": 8.245240312809411e-06, "loss": 2.3265, "step": 3991000 }, { "epoch": 8.355452657668218, "grad_norm": 16.506113052368164, "learning_rate": 8.23999605631912e-06, "loss": 2.3285, "step": 3991500 }, { "epoch": 8.356499313393844, "grad_norm": 18.135950088500977, "learning_rate": 8.234751799828828e-06, "loss": 2.3336, "step": 3992000 }, { "epoch": 8.35754596911947, "grad_norm": 19.192041397094727, "learning_rate": 8.229507543338536e-06, "loss": 2.3339, "step": 3992500 }, { "epoch": 8.358592624845095, "grad_norm": 18.7352294921875, "learning_rate": 8.224263286848244e-06, "loss": 2.3253, "step": 3993000 }, { "epoch": 8.35963928057072, "grad_norm": 22.39306640625, "learning_rate": 8.219019030357953e-06, "loss": 2.32, "step": 3993500 }, { "epoch": 8.360685936296345, "grad_norm": 17.22637367248535, "learning_rate": 8.21377477386766e-06, "loss": 2.3198, "step": 3994000 }, { "epoch": 8.361732592021971, "grad_norm": 20.357439041137695, "learning_rate": 8.208530517377368e-06, "loss": 2.3381, "step": 3994500 }, { "epoch": 8.362779247747596, "grad_norm": 19.226837158203125, "learning_rate": 8.203286260887078e-06, "loss": 2.3366, "step": 3995000 }, { "epoch": 8.363825903473222, "grad_norm": 20.79778289794922, "learning_rate": 8.198042004396784e-06, "loss": 2.3315, "step": 3995500 }, { "epoch": 8.364872559198847, "grad_norm": 17.620729446411133, "learning_rate": 8.192797747906493e-06, "loss": 2.3232, "step": 3996000 }, { "epoch": 8.365919214924473, "grad_norm": 18.593669891357422, "learning_rate": 8.1875534914162e-06, "loss": 2.3115, "step": 3996500 }, { "epoch": 8.366965870650098, "grad_norm": 32.03464126586914, "learning_rate": 8.18230923492591e-06, "loss": 2.3225, "step": 3997000 }, { "epoch": 8.368012526375724, "grad_norm": 19.236652374267578, "learning_rate": 8.177064978435618e-06, "loss": 2.3334, "step": 3997500 }, { "epoch": 8.36905918210135, "grad_norm": 16.602458953857422, "learning_rate": 8.171820721945326e-06, "loss": 2.3212, "step": 3998000 }, { "epoch": 8.370105837826975, "grad_norm": 20.449350357055664, "learning_rate": 8.166576465455033e-06, "loss": 2.3263, "step": 3998500 }, { "epoch": 8.3711524935526, "grad_norm": 19.283348083496094, "learning_rate": 8.161332208964743e-06, "loss": 2.3306, "step": 3999000 }, { "epoch": 8.372199149278226, "grad_norm": 20.677549362182617, "learning_rate": 8.15608795247445e-06, "loss": 2.3206, "step": 3999500 }, { "epoch": 8.373245805003851, "grad_norm": 18.013954162597656, "learning_rate": 8.15084369598416e-06, "loss": 2.3248, "step": 4000000 }, { "epoch": 8.374292460729476, "grad_norm": 18.466075897216797, "learning_rate": 8.145599439493867e-06, "loss": 2.3323, "step": 4000500 }, { "epoch": 8.375339116455102, "grad_norm": 20.93260955810547, "learning_rate": 8.140355183003575e-06, "loss": 2.3262, "step": 4001000 }, { "epoch": 8.376385772180727, "grad_norm": 17.013805389404297, "learning_rate": 8.135110926513283e-06, "loss": 2.3361, "step": 4001500 }, { "epoch": 8.377432427906353, "grad_norm": 19.864831924438477, "learning_rate": 8.12986667002299e-06, "loss": 2.3286, "step": 4002000 }, { "epoch": 8.378479083631978, "grad_norm": 17.164613723754883, "learning_rate": 8.1246224135327e-06, "loss": 2.3315, "step": 4002500 }, { "epoch": 8.379525739357604, "grad_norm": 19.01630210876465, "learning_rate": 8.119378157042408e-06, "loss": 2.3124, "step": 4003000 }, { "epoch": 8.380572395083231, "grad_norm": 29.995054244995117, "learning_rate": 8.114133900552117e-06, "loss": 2.3236, "step": 4003500 }, { "epoch": 8.381619050808856, "grad_norm": 34.48688888549805, "learning_rate": 8.108889644061823e-06, "loss": 2.3265, "step": 4004000 }, { "epoch": 8.382665706534482, "grad_norm": 19.29151153564453, "learning_rate": 8.103645387571532e-06, "loss": 2.3239, "step": 4004500 }, { "epoch": 8.383712362260107, "grad_norm": 20.19856071472168, "learning_rate": 8.09840113108124e-06, "loss": 2.318, "step": 4005000 }, { "epoch": 8.384759017985733, "grad_norm": 22.500539779663086, "learning_rate": 8.09315687459095e-06, "loss": 2.3353, "step": 4005500 }, { "epoch": 8.385805673711358, "grad_norm": 17.241350173950195, "learning_rate": 8.087912618100657e-06, "loss": 2.307, "step": 4006000 }, { "epoch": 8.386852329436984, "grad_norm": 16.76925277709961, "learning_rate": 8.082668361610365e-06, "loss": 2.3227, "step": 4006500 }, { "epoch": 8.38789898516261, "grad_norm": 17.45073890686035, "learning_rate": 8.077424105120072e-06, "loss": 2.32, "step": 4007000 }, { "epoch": 8.388945640888235, "grad_norm": 18.303199768066406, "learning_rate": 8.07217984862978e-06, "loss": 2.3287, "step": 4007500 }, { "epoch": 8.38999229661386, "grad_norm": 22.73206329345703, "learning_rate": 8.06693559213949e-06, "loss": 2.3328, "step": 4008000 }, { "epoch": 8.391038952339485, "grad_norm": 17.777917861938477, "learning_rate": 8.061691335649197e-06, "loss": 2.3321, "step": 4008500 }, { "epoch": 8.392085608065111, "grad_norm": 18.80194091796875, "learning_rate": 8.056447079158907e-06, "loss": 2.3178, "step": 4009000 }, { "epoch": 8.393132263790736, "grad_norm": 20.371341705322266, "learning_rate": 8.051202822668613e-06, "loss": 2.333, "step": 4009500 }, { "epoch": 8.394178919516362, "grad_norm": 18.984725952148438, "learning_rate": 8.045958566178322e-06, "loss": 2.341, "step": 4010000 }, { "epoch": 8.395225575241987, "grad_norm": 18.30462074279785, "learning_rate": 8.04071430968803e-06, "loss": 2.3286, "step": 4010500 }, { "epoch": 8.396272230967613, "grad_norm": 18.687040328979492, "learning_rate": 8.035470053197739e-06, "loss": 2.3196, "step": 4011000 }, { "epoch": 8.397318886693238, "grad_norm": 26.631149291992188, "learning_rate": 8.030225796707447e-06, "loss": 2.3419, "step": 4011500 }, { "epoch": 8.398365542418864, "grad_norm": 15.061326026916504, "learning_rate": 8.024981540217154e-06, "loss": 2.33, "step": 4012000 }, { "epoch": 8.39941219814449, "grad_norm": 18.16072654724121, "learning_rate": 8.019737283726862e-06, "loss": 2.3102, "step": 4012500 }, { "epoch": 8.400458853870115, "grad_norm": 18.312541961669922, "learning_rate": 8.01449302723657e-06, "loss": 2.3187, "step": 4013000 }, { "epoch": 8.40150550959574, "grad_norm": 17.01963996887207, "learning_rate": 8.00924877074628e-06, "loss": 2.3084, "step": 4013500 }, { "epoch": 8.402552165321366, "grad_norm": 23.24225616455078, "learning_rate": 8.004004514255987e-06, "loss": 2.3345, "step": 4014000 }, { "epoch": 8.403598821046991, "grad_norm": 19.270431518554688, "learning_rate": 7.998760257765696e-06, "loss": 2.3119, "step": 4014500 }, { "epoch": 8.404645476772616, "grad_norm": 19.161731719970703, "learning_rate": 7.993516001275402e-06, "loss": 2.3253, "step": 4015000 }, { "epoch": 8.405692132498242, "grad_norm": 16.978361129760742, "learning_rate": 7.988271744785112e-06, "loss": 2.337, "step": 4015500 }, { "epoch": 8.406738788223867, "grad_norm": 20.6834659576416, "learning_rate": 7.98302748829482e-06, "loss": 2.3056, "step": 4016000 }, { "epoch": 8.407785443949493, "grad_norm": 21.131864547729492, "learning_rate": 7.977783231804529e-06, "loss": 2.3217, "step": 4016500 }, { "epoch": 8.408832099675118, "grad_norm": 20.46363639831543, "learning_rate": 7.972538975314236e-06, "loss": 2.3213, "step": 4017000 }, { "epoch": 8.409878755400744, "grad_norm": 18.290517807006836, "learning_rate": 7.967294718823944e-06, "loss": 2.3119, "step": 4017500 }, { "epoch": 8.41092541112637, "grad_norm": 20.10504913330078, "learning_rate": 7.962050462333652e-06, "loss": 2.343, "step": 4018000 }, { "epoch": 8.411972066851995, "grad_norm": 17.725482940673828, "learning_rate": 7.956806205843361e-06, "loss": 2.3411, "step": 4018500 }, { "epoch": 8.41301872257762, "grad_norm": 21.113988876342773, "learning_rate": 7.951561949353069e-06, "loss": 2.3397, "step": 4019000 }, { "epoch": 8.414065378303246, "grad_norm": 16.45224952697754, "learning_rate": 7.946317692862777e-06, "loss": 2.3255, "step": 4019500 }, { "epoch": 8.415112034028871, "grad_norm": 18.60120964050293, "learning_rate": 7.941073436372486e-06, "loss": 2.3315, "step": 4020000 }, { "epoch": 8.416158689754496, "grad_norm": 17.549705505371094, "learning_rate": 7.935829179882194e-06, "loss": 2.3223, "step": 4020500 }, { "epoch": 8.417205345480122, "grad_norm": 16.4660701751709, "learning_rate": 7.930584923391901e-06, "loss": 2.2985, "step": 4021000 }, { "epoch": 8.418252001205747, "grad_norm": 25.402320861816406, "learning_rate": 7.925340666901609e-06, "loss": 2.3344, "step": 4021500 }, { "epoch": 8.419298656931373, "grad_norm": 19.157323837280273, "learning_rate": 7.920096410411318e-06, "loss": 2.3059, "step": 4022000 }, { "epoch": 8.420345312656998, "grad_norm": 28.211576461791992, "learning_rate": 7.914852153921026e-06, "loss": 2.3139, "step": 4022500 }, { "epoch": 8.421391968382624, "grad_norm": 21.537776947021484, "learning_rate": 7.909607897430736e-06, "loss": 2.32, "step": 4023000 }, { "epoch": 8.42243862410825, "grad_norm": 21.094863891601562, "learning_rate": 7.904363640940442e-06, "loss": 2.3199, "step": 4023500 }, { "epoch": 8.423485279833875, "grad_norm": 19.11260223388672, "learning_rate": 7.899119384450151e-06, "loss": 2.3264, "step": 4024000 }, { "epoch": 8.4245319355595, "grad_norm": 21.772815704345703, "learning_rate": 7.893875127959859e-06, "loss": 2.3213, "step": 4024500 }, { "epoch": 8.425578591285126, "grad_norm": 25.711763381958008, "learning_rate": 7.888630871469566e-06, "loss": 2.3219, "step": 4025000 }, { "epoch": 8.426625247010751, "grad_norm": 20.111679077148438, "learning_rate": 7.883386614979276e-06, "loss": 2.3012, "step": 4025500 }, { "epoch": 8.427671902736376, "grad_norm": 24.172626495361328, "learning_rate": 7.878142358488983e-06, "loss": 2.3054, "step": 4026000 }, { "epoch": 8.428718558462002, "grad_norm": 18.306453704833984, "learning_rate": 7.872898101998691e-06, "loss": 2.3283, "step": 4026500 }, { "epoch": 8.429765214187627, "grad_norm": 17.61922264099121, "learning_rate": 7.867653845508399e-06, "loss": 2.3192, "step": 4027000 }, { "epoch": 8.430811869913253, "grad_norm": 17.07660675048828, "learning_rate": 7.862409589018108e-06, "loss": 2.2933, "step": 4027500 }, { "epoch": 8.431858525638878, "grad_norm": 15.648914337158203, "learning_rate": 7.857165332527816e-06, "loss": 2.3184, "step": 4028000 }, { "epoch": 8.432905181364504, "grad_norm": 19.24228858947754, "learning_rate": 7.851921076037525e-06, "loss": 2.3115, "step": 4028500 }, { "epoch": 8.43395183709013, "grad_norm": 20.870975494384766, "learning_rate": 7.846676819547233e-06, "loss": 2.3293, "step": 4029000 }, { "epoch": 8.434998492815755, "grad_norm": 21.00062370300293, "learning_rate": 7.84143256305694e-06, "loss": 2.3277, "step": 4029500 }, { "epoch": 8.43604514854138, "grad_norm": 17.03116798400879, "learning_rate": 7.836188306566648e-06, "loss": 2.3095, "step": 4030000 }, { "epoch": 8.437091804267006, "grad_norm": 21.74484634399414, "learning_rate": 7.830944050076356e-06, "loss": 2.3102, "step": 4030500 }, { "epoch": 8.438138459992631, "grad_norm": 21.311660766601562, "learning_rate": 7.825699793586065e-06, "loss": 2.3277, "step": 4031000 }, { "epoch": 8.439185115718256, "grad_norm": 22.29060173034668, "learning_rate": 7.820455537095773e-06, "loss": 2.318, "step": 4031500 }, { "epoch": 8.440231771443882, "grad_norm": 17.57488250732422, "learning_rate": 7.81521128060548e-06, "loss": 2.3128, "step": 4032000 }, { "epoch": 8.441278427169507, "grad_norm": 18.22705841064453, "learning_rate": 7.809967024115188e-06, "loss": 2.3424, "step": 4032500 }, { "epoch": 8.442325082895133, "grad_norm": 22.696895599365234, "learning_rate": 7.804722767624898e-06, "loss": 2.3352, "step": 4033000 }, { "epoch": 8.443371738620758, "grad_norm": 18.906410217285156, "learning_rate": 7.799478511134606e-06, "loss": 2.3267, "step": 4033500 }, { "epoch": 8.444418394346384, "grad_norm": 21.388427734375, "learning_rate": 7.794234254644315e-06, "loss": 2.321, "step": 4034000 }, { "epoch": 8.44546505007201, "grad_norm": 19.259790420532227, "learning_rate": 7.788989998154023e-06, "loss": 2.304, "step": 4034500 }, { "epoch": 8.446511705797635, "grad_norm": 19.407451629638672, "learning_rate": 7.78374574166373e-06, "loss": 2.3163, "step": 4035000 }, { "epoch": 8.44755836152326, "grad_norm": 19.03086280822754, "learning_rate": 7.778501485173438e-06, "loss": 2.2961, "step": 4035500 }, { "epoch": 8.448605017248886, "grad_norm": 17.205190658569336, "learning_rate": 7.773257228683146e-06, "loss": 2.3133, "step": 4036000 }, { "epoch": 8.449651672974511, "grad_norm": 15.949979782104492, "learning_rate": 7.768012972192855e-06, "loss": 2.3059, "step": 4036500 }, { "epoch": 8.450698328700136, "grad_norm": 15.74425220489502, "learning_rate": 7.762768715702563e-06, "loss": 2.3272, "step": 4037000 }, { "epoch": 8.451744984425762, "grad_norm": 17.076045989990234, "learning_rate": 7.757524459212272e-06, "loss": 2.3267, "step": 4037500 }, { "epoch": 8.45279164015139, "grad_norm": 25.497385025024414, "learning_rate": 7.752280202721978e-06, "loss": 2.3192, "step": 4038000 }, { "epoch": 8.453838295877013, "grad_norm": 18.535127639770508, "learning_rate": 7.747035946231688e-06, "loss": 2.3215, "step": 4038500 }, { "epoch": 8.45488495160264, "grad_norm": 16.61117172241211, "learning_rate": 7.741791689741395e-06, "loss": 2.2966, "step": 4039000 }, { "epoch": 8.455931607328266, "grad_norm": 18.014551162719727, "learning_rate": 7.736547433251105e-06, "loss": 2.3247, "step": 4039500 }, { "epoch": 8.456978263053891, "grad_norm": 17.108375549316406, "learning_rate": 7.731303176760812e-06, "loss": 2.3241, "step": 4040000 }, { "epoch": 8.458024918779516, "grad_norm": 19.404111862182617, "learning_rate": 7.72605892027052e-06, "loss": 2.3072, "step": 4040500 }, { "epoch": 8.459071574505142, "grad_norm": 20.257715225219727, "learning_rate": 7.720814663780228e-06, "loss": 2.3225, "step": 4041000 }, { "epoch": 8.460118230230767, "grad_norm": 18.371822357177734, "learning_rate": 7.715570407289937e-06, "loss": 2.3317, "step": 4041500 }, { "epoch": 8.461164885956393, "grad_norm": 19.236160278320312, "learning_rate": 7.710326150799645e-06, "loss": 2.331, "step": 4042000 }, { "epoch": 8.462211541682018, "grad_norm": 18.47724723815918, "learning_rate": 7.705081894309352e-06, "loss": 2.3239, "step": 4042500 }, { "epoch": 8.463258197407644, "grad_norm": 20.859272003173828, "learning_rate": 7.699837637819062e-06, "loss": 2.3359, "step": 4043000 }, { "epoch": 8.46430485313327, "grad_norm": 22.20708656311035, "learning_rate": 7.694593381328768e-06, "loss": 2.3132, "step": 4043500 }, { "epoch": 8.465351508858895, "grad_norm": 18.662635803222656, "learning_rate": 7.689349124838477e-06, "loss": 2.3365, "step": 4044000 }, { "epoch": 8.46639816458452, "grad_norm": 20.977645874023438, "learning_rate": 7.684104868348185e-06, "loss": 2.3209, "step": 4044500 }, { "epoch": 8.467444820310146, "grad_norm": 24.422836303710938, "learning_rate": 7.678860611857894e-06, "loss": 2.3219, "step": 4045000 }, { "epoch": 8.468491476035771, "grad_norm": 20.51860809326172, "learning_rate": 7.673616355367602e-06, "loss": 2.3156, "step": 4045500 }, { "epoch": 8.469538131761396, "grad_norm": 17.88079261779785, "learning_rate": 7.668372098877311e-06, "loss": 2.3235, "step": 4046000 }, { "epoch": 8.470584787487022, "grad_norm": 16.97974967956543, "learning_rate": 7.663127842387017e-06, "loss": 2.2983, "step": 4046500 }, { "epoch": 8.471631443212647, "grad_norm": 19.690763473510742, "learning_rate": 7.657883585896727e-06, "loss": 2.3042, "step": 4047000 }, { "epoch": 8.472678098938273, "grad_norm": 22.73923110961914, "learning_rate": 7.652639329406434e-06, "loss": 2.3243, "step": 4047500 }, { "epoch": 8.473724754663898, "grad_norm": 18.432329177856445, "learning_rate": 7.647395072916142e-06, "loss": 2.3111, "step": 4048000 }, { "epoch": 8.474771410389524, "grad_norm": 18.60053825378418, "learning_rate": 7.642150816425851e-06, "loss": 2.3081, "step": 4048500 }, { "epoch": 8.47581806611515, "grad_norm": 20.483976364135742, "learning_rate": 7.636906559935557e-06, "loss": 2.322, "step": 4049000 }, { "epoch": 8.476864721840775, "grad_norm": 18.21925163269043, "learning_rate": 7.631662303445267e-06, "loss": 2.3173, "step": 4049500 }, { "epoch": 8.4779113775664, "grad_norm": 18.02851104736328, "learning_rate": 7.626418046954975e-06, "loss": 2.3449, "step": 4050000 }, { "epoch": 8.478958033292026, "grad_norm": 17.85140037536621, "learning_rate": 7.621173790464683e-06, "loss": 2.3176, "step": 4050500 }, { "epoch": 8.480004689017651, "grad_norm": 17.94702911376953, "learning_rate": 7.615929533974392e-06, "loss": 2.3328, "step": 4051000 }, { "epoch": 8.481051344743276, "grad_norm": 22.062654495239258, "learning_rate": 7.6106852774841e-06, "loss": 2.3186, "step": 4051500 }, { "epoch": 8.482098000468902, "grad_norm": 23.2300968170166, "learning_rate": 7.605441020993807e-06, "loss": 2.3258, "step": 4052000 }, { "epoch": 8.483144656194527, "grad_norm": 17.877227783203125, "learning_rate": 7.6001967645035156e-06, "loss": 2.3195, "step": 4052500 }, { "epoch": 8.484191311920153, "grad_norm": 17.973257064819336, "learning_rate": 7.594952508013224e-06, "loss": 2.3115, "step": 4053000 }, { "epoch": 8.485237967645778, "grad_norm": 21.135570526123047, "learning_rate": 7.589708251522933e-06, "loss": 2.3, "step": 4053500 }, { "epoch": 8.486284623371404, "grad_norm": 21.050491333007812, "learning_rate": 7.584463995032641e-06, "loss": 2.3142, "step": 4054000 }, { "epoch": 8.48733127909703, "grad_norm": 18.185611724853516, "learning_rate": 7.57921973854235e-06, "loss": 2.326, "step": 4054500 }, { "epoch": 8.488377934822655, "grad_norm": 20.858844757080078, "learning_rate": 7.5739754820520566e-06, "loss": 2.3186, "step": 4055000 }, { "epoch": 8.48942459054828, "grad_norm": 20.652368545532227, "learning_rate": 7.568731225561765e-06, "loss": 2.3233, "step": 4055500 }, { "epoch": 8.490471246273906, "grad_norm": 25.03843879699707, "learning_rate": 7.563486969071474e-06, "loss": 2.3378, "step": 4056000 }, { "epoch": 8.491517901999531, "grad_norm": 21.110637664794922, "learning_rate": 7.558242712581181e-06, "loss": 2.333, "step": 4056500 }, { "epoch": 8.492564557725157, "grad_norm": 20.060916900634766, "learning_rate": 7.55299845609089e-06, "loss": 2.3111, "step": 4057000 }, { "epoch": 8.493611213450782, "grad_norm": 17.196502685546875, "learning_rate": 7.547754199600597e-06, "loss": 2.3284, "step": 4057500 }, { "epoch": 8.494657869176407, "grad_norm": 20.45981788635254, "learning_rate": 7.542509943110305e-06, "loss": 2.3197, "step": 4058000 }, { "epoch": 8.495704524902033, "grad_norm": 19.323762893676758, "learning_rate": 7.537265686620014e-06, "loss": 2.3089, "step": 4058500 }, { "epoch": 8.496751180627658, "grad_norm": 17.456228256225586, "learning_rate": 7.532021430129722e-06, "loss": 2.3203, "step": 4059000 }, { "epoch": 8.497797836353284, "grad_norm": 18.958776473999023, "learning_rate": 7.526777173639431e-06, "loss": 2.319, "step": 4059500 }, { "epoch": 8.49884449207891, "grad_norm": 16.503015518188477, "learning_rate": 7.521532917149139e-06, "loss": 2.31, "step": 4060000 }, { "epoch": 8.499891147804535, "grad_norm": 20.3102970123291, "learning_rate": 7.516288660658846e-06, "loss": 2.3126, "step": 4060500 }, { "epoch": 8.50093780353016, "grad_norm": 18.58310890197754, "learning_rate": 7.511044404168555e-06, "loss": 2.33, "step": 4061000 }, { "epoch": 8.501984459255786, "grad_norm": 16.717941284179688, "learning_rate": 7.505800147678263e-06, "loss": 2.3146, "step": 4061500 }, { "epoch": 8.503031114981411, "grad_norm": 20.39862632751465, "learning_rate": 7.500555891187971e-06, "loss": 2.3211, "step": 4062000 }, { "epoch": 8.504077770707037, "grad_norm": 22.231225967407227, "learning_rate": 7.4953116346976796e-06, "loss": 2.3203, "step": 4062500 }, { "epoch": 8.505124426432662, "grad_norm": 19.12047004699707, "learning_rate": 7.490067378207388e-06, "loss": 2.3349, "step": 4063000 }, { "epoch": 8.506171082158287, "grad_norm": 21.03373908996582, "learning_rate": 7.484823121717095e-06, "loss": 2.3318, "step": 4063500 }, { "epoch": 8.507217737883913, "grad_norm": 17.252208709716797, "learning_rate": 7.4795788652268035e-06, "loss": 2.3187, "step": 4064000 }, { "epoch": 8.508264393609538, "grad_norm": 19.59552764892578, "learning_rate": 7.474334608736512e-06, "loss": 2.313, "step": 4064500 }, { "epoch": 8.509311049335164, "grad_norm": 20.729745864868164, "learning_rate": 7.4690903522462205e-06, "loss": 2.3119, "step": 4065000 }, { "epoch": 8.51035770506079, "grad_norm": 23.4283504486084, "learning_rate": 7.463846095755929e-06, "loss": 2.3205, "step": 4065500 }, { "epoch": 8.511404360786415, "grad_norm": 16.497119903564453, "learning_rate": 7.458601839265638e-06, "loss": 2.3296, "step": 4066000 }, { "epoch": 8.51245101651204, "grad_norm": 19.488645553588867, "learning_rate": 7.4533575827753445e-06, "loss": 2.329, "step": 4066500 }, { "epoch": 8.513497672237666, "grad_norm": 19.38746452331543, "learning_rate": 7.448113326285053e-06, "loss": 2.3203, "step": 4067000 }, { "epoch": 8.514544327963291, "grad_norm": 19.746212005615234, "learning_rate": 7.4428690697947615e-06, "loss": 2.3124, "step": 4067500 }, { "epoch": 8.515590983688917, "grad_norm": 19.939197540283203, "learning_rate": 7.437624813304469e-06, "loss": 2.302, "step": 4068000 }, { "epoch": 8.516637639414542, "grad_norm": 23.222299575805664, "learning_rate": 7.432380556814178e-06, "loss": 2.3335, "step": 4068500 }, { "epoch": 8.517684295140167, "grad_norm": 27.170246124267578, "learning_rate": 7.427136300323885e-06, "loss": 2.3271, "step": 4069000 }, { "epoch": 8.518730950865793, "grad_norm": 19.131383895874023, "learning_rate": 7.421892043833593e-06, "loss": 2.335, "step": 4069500 }, { "epoch": 8.519777606591418, "grad_norm": 16.56690216064453, "learning_rate": 7.416647787343302e-06, "loss": 2.3096, "step": 4070000 }, { "epoch": 8.520824262317044, "grad_norm": 20.41043472290039, "learning_rate": 7.41140353085301e-06, "loss": 2.2997, "step": 4070500 }, { "epoch": 8.52187091804267, "grad_norm": 20.09208106994629, "learning_rate": 7.406159274362719e-06, "loss": 2.3076, "step": 4071000 }, { "epoch": 8.522917573768295, "grad_norm": 23.157522201538086, "learning_rate": 7.400915017872427e-06, "loss": 2.3122, "step": 4071500 }, { "epoch": 8.52396422949392, "grad_norm": 18.241914749145508, "learning_rate": 7.395670761382134e-06, "loss": 2.3167, "step": 4072000 }, { "epoch": 8.525010885219547, "grad_norm": 19.09385871887207, "learning_rate": 7.390426504891843e-06, "loss": 2.3255, "step": 4072500 }, { "epoch": 8.526057540945171, "grad_norm": 17.97538948059082, "learning_rate": 7.385182248401551e-06, "loss": 2.3135, "step": 4073000 }, { "epoch": 8.527104196670798, "grad_norm": 19.98259162902832, "learning_rate": 7.379937991911259e-06, "loss": 2.3076, "step": 4073500 }, { "epoch": 8.528150852396422, "grad_norm": 17.681859970092773, "learning_rate": 7.3746937354209675e-06, "loss": 2.315, "step": 4074000 }, { "epoch": 8.52919750812205, "grad_norm": 20.322887420654297, "learning_rate": 7.369449478930676e-06, "loss": 2.3218, "step": 4074500 }, { "epoch": 8.530244163847675, "grad_norm": 21.83855438232422, "learning_rate": 7.364205222440383e-06, "loss": 2.3127, "step": 4075000 }, { "epoch": 8.5312908195733, "grad_norm": 23.303009033203125, "learning_rate": 7.358960965950091e-06, "loss": 2.3275, "step": 4075500 }, { "epoch": 8.532337475298926, "grad_norm": 21.004987716674805, "learning_rate": 7.3537167094598e-06, "loss": 2.3256, "step": 4076000 }, { "epoch": 8.533384131024551, "grad_norm": 17.95092010498047, "learning_rate": 7.3484724529695085e-06, "loss": 2.3309, "step": 4076500 }, { "epoch": 8.534430786750177, "grad_norm": 19.52182960510254, "learning_rate": 7.343228196479217e-06, "loss": 2.3275, "step": 4077000 }, { "epoch": 8.535477442475802, "grad_norm": 17.926977157592773, "learning_rate": 7.337983939988924e-06, "loss": 2.3254, "step": 4077500 }, { "epoch": 8.536524098201427, "grad_norm": 20.174135208129883, "learning_rate": 7.332739683498632e-06, "loss": 2.3166, "step": 4078000 }, { "epoch": 8.537570753927053, "grad_norm": 18.604183197021484, "learning_rate": 7.327495427008341e-06, "loss": 2.3063, "step": 4078500 }, { "epoch": 8.538617409652678, "grad_norm": 17.790348052978516, "learning_rate": 7.3222511705180495e-06, "loss": 2.3128, "step": 4079000 }, { "epoch": 8.539664065378304, "grad_norm": 17.441604614257812, "learning_rate": 7.317006914027757e-06, "loss": 2.3115, "step": 4079500 }, { "epoch": 8.54071072110393, "grad_norm": 18.168649673461914, "learning_rate": 7.311762657537466e-06, "loss": 2.3227, "step": 4080000 }, { "epoch": 8.541757376829555, "grad_norm": 21.205652236938477, "learning_rate": 7.3065184010471725e-06, "loss": 2.34, "step": 4080500 }, { "epoch": 8.54280403255518, "grad_norm": 16.70974349975586, "learning_rate": 7.301274144556881e-06, "loss": 2.335, "step": 4081000 }, { "epoch": 8.543850688280806, "grad_norm": 25.301982879638672, "learning_rate": 7.29602988806659e-06, "loss": 2.3343, "step": 4081500 }, { "epoch": 8.544897344006431, "grad_norm": 23.780616760253906, "learning_rate": 7.290785631576298e-06, "loss": 2.3, "step": 4082000 }, { "epoch": 8.545943999732057, "grad_norm": 17.51781463623047, "learning_rate": 7.285541375086007e-06, "loss": 2.3159, "step": 4082500 }, { "epoch": 8.546990655457682, "grad_norm": 18.981672286987305, "learning_rate": 7.280297118595715e-06, "loss": 2.3401, "step": 4083000 }, { "epoch": 8.548037311183307, "grad_norm": 17.184696197509766, "learning_rate": 7.275052862105422e-06, "loss": 2.3458, "step": 4083500 }, { "epoch": 8.549083966908933, "grad_norm": 18.943050384521484, "learning_rate": 7.269808605615131e-06, "loss": 2.3215, "step": 4084000 }, { "epoch": 8.550130622634558, "grad_norm": 26.725858688354492, "learning_rate": 7.264564349124839e-06, "loss": 2.319, "step": 4084500 }, { "epoch": 8.551177278360184, "grad_norm": 20.17902946472168, "learning_rate": 7.259320092634548e-06, "loss": 2.3178, "step": 4085000 }, { "epoch": 8.55222393408581, "grad_norm": 19.219661712646484, "learning_rate": 7.254075836144255e-06, "loss": 2.3164, "step": 4085500 }, { "epoch": 8.553270589811435, "grad_norm": 17.45247459411621, "learning_rate": 7.248831579653963e-06, "loss": 2.3067, "step": 4086000 }, { "epoch": 8.55431724553706, "grad_norm": 18.1339111328125, "learning_rate": 7.243587323163671e-06, "loss": 2.2995, "step": 4086500 }, { "epoch": 8.555363901262686, "grad_norm": 20.0684757232666, "learning_rate": 7.238343066673379e-06, "loss": 2.3253, "step": 4087000 }, { "epoch": 8.556410556988311, "grad_norm": 23.26561737060547, "learning_rate": 7.233098810183088e-06, "loss": 2.294, "step": 4087500 }, { "epoch": 8.557457212713937, "grad_norm": 21.912748336791992, "learning_rate": 7.227854553692796e-06, "loss": 2.3139, "step": 4088000 }, { "epoch": 8.558503868439562, "grad_norm": 17.595319747924805, "learning_rate": 7.222610297202505e-06, "loss": 2.3181, "step": 4088500 }, { "epoch": 8.559550524165187, "grad_norm": 16.919172286987305, "learning_rate": 7.217366040712212e-06, "loss": 2.3008, "step": 4089000 }, { "epoch": 8.560597179890813, "grad_norm": 20.798208236694336, "learning_rate": 7.21212178422192e-06, "loss": 2.3181, "step": 4089500 }, { "epoch": 8.561643835616438, "grad_norm": 18.539907455444336, "learning_rate": 7.206877527731629e-06, "loss": 2.3152, "step": 4090000 }, { "epoch": 8.562690491342064, "grad_norm": 24.230924606323242, "learning_rate": 7.201633271241337e-06, "loss": 2.2958, "step": 4090500 }, { "epoch": 8.56373714706769, "grad_norm": 19.295087814331055, "learning_rate": 7.196389014751045e-06, "loss": 2.3123, "step": 4091000 }, { "epoch": 8.564783802793315, "grad_norm": 21.36899185180664, "learning_rate": 7.191144758260754e-06, "loss": 2.2977, "step": 4091500 }, { "epoch": 8.56583045851894, "grad_norm": 19.95661735534668, "learning_rate": 7.185900501770461e-06, "loss": 2.2969, "step": 4092000 }, { "epoch": 8.566877114244566, "grad_norm": 17.598793029785156, "learning_rate": 7.180656245280169e-06, "loss": 2.3355, "step": 4092500 }, { "epoch": 8.567923769970191, "grad_norm": 19.106782913208008, "learning_rate": 7.1754119887898775e-06, "loss": 2.3207, "step": 4093000 }, { "epoch": 8.568970425695817, "grad_norm": 18.300996780395508, "learning_rate": 7.170167732299586e-06, "loss": 2.3074, "step": 4093500 }, { "epoch": 8.570017081421442, "grad_norm": 18.93960189819336, "learning_rate": 7.164923475809295e-06, "loss": 2.3115, "step": 4094000 }, { "epoch": 8.571063737147067, "grad_norm": 16.858543395996094, "learning_rate": 7.1596792193190014e-06, "loss": 2.3312, "step": 4094500 }, { "epoch": 8.572110392872693, "grad_norm": 19.03811264038086, "learning_rate": 7.15443496282871e-06, "loss": 2.3048, "step": 4095000 }, { "epoch": 8.573157048598318, "grad_norm": 18.289506912231445, "learning_rate": 7.1491907063384185e-06, "loss": 2.3095, "step": 4095500 }, { "epoch": 8.574203704323944, "grad_norm": 17.487171173095703, "learning_rate": 7.143946449848127e-06, "loss": 2.3261, "step": 4096000 }, { "epoch": 8.57525036004957, "grad_norm": 20.112653732299805, "learning_rate": 7.138702193357836e-06, "loss": 2.3315, "step": 4096500 }, { "epoch": 8.576297015775195, "grad_norm": 24.669952392578125, "learning_rate": 7.133457936867543e-06, "loss": 2.3186, "step": 4097000 }, { "epoch": 8.57734367150082, "grad_norm": 21.394128799438477, "learning_rate": 7.128213680377251e-06, "loss": 2.3154, "step": 4097500 }, { "epoch": 8.578390327226446, "grad_norm": 16.98015022277832, "learning_rate": 7.122969423886959e-06, "loss": 2.3119, "step": 4098000 }, { "epoch": 8.579436982952071, "grad_norm": 21.031715393066406, "learning_rate": 7.117725167396667e-06, "loss": 2.3273, "step": 4098500 }, { "epoch": 8.580483638677697, "grad_norm": 20.873367309570312, "learning_rate": 7.112480910906376e-06, "loss": 2.3134, "step": 4099000 }, { "epoch": 8.581530294403322, "grad_norm": 17.969070434570312, "learning_rate": 7.107236654416084e-06, "loss": 2.3131, "step": 4099500 }, { "epoch": 8.582576950128948, "grad_norm": 19.470413208007812, "learning_rate": 7.101992397925793e-06, "loss": 2.2901, "step": 4100000 }, { "epoch": 8.583623605854573, "grad_norm": 20.108179092407227, "learning_rate": 7.0967481414355e-06, "loss": 2.3044, "step": 4100500 }, { "epoch": 8.584670261580198, "grad_norm": 19.33083724975586, "learning_rate": 7.091503884945208e-06, "loss": 2.3032, "step": 4101000 }, { "epoch": 8.585716917305824, "grad_norm": 17.089841842651367, "learning_rate": 7.086259628454917e-06, "loss": 2.3162, "step": 4101500 }, { "epoch": 8.58676357303145, "grad_norm": 20.70647430419922, "learning_rate": 7.081015371964625e-06, "loss": 2.3072, "step": 4102000 }, { "epoch": 8.587810228757075, "grad_norm": 24.04594612121582, "learning_rate": 7.075771115474333e-06, "loss": 2.3329, "step": 4102500 }, { "epoch": 8.5888568844827, "grad_norm": 24.85735511779785, "learning_rate": 7.070526858984041e-06, "loss": 2.3171, "step": 4103000 }, { "epoch": 8.589903540208326, "grad_norm": 20.227066040039062, "learning_rate": 7.065282602493749e-06, "loss": 2.3112, "step": 4103500 }, { "epoch": 8.590950195933951, "grad_norm": 19.25640869140625, "learning_rate": 7.060038346003457e-06, "loss": 2.312, "step": 4104000 }, { "epoch": 8.591996851659577, "grad_norm": 17.274372100830078, "learning_rate": 7.054794089513165e-06, "loss": 2.3128, "step": 4104500 }, { "epoch": 8.593043507385202, "grad_norm": 19.303150177001953, "learning_rate": 7.049549833022874e-06, "loss": 2.3037, "step": 4105000 }, { "epoch": 8.594090163110828, "grad_norm": 19.568641662597656, "learning_rate": 7.0443055765325825e-06, "loss": 2.3167, "step": 4105500 }, { "epoch": 8.595136818836453, "grad_norm": 21.75440788269043, "learning_rate": 7.039061320042289e-06, "loss": 2.3134, "step": 4106000 }, { "epoch": 8.596183474562078, "grad_norm": 20.605615615844727, "learning_rate": 7.033817063551998e-06, "loss": 2.3185, "step": 4106500 }, { "epoch": 8.597230130287706, "grad_norm": 19.870311737060547, "learning_rate": 7.028572807061706e-06, "loss": 2.3188, "step": 4107000 }, { "epoch": 8.59827678601333, "grad_norm": 19.369888305664062, "learning_rate": 7.023328550571415e-06, "loss": 2.3153, "step": 4107500 }, { "epoch": 8.599323441738957, "grad_norm": 21.36354637145996, "learning_rate": 7.0180842940811235e-06, "loss": 2.3077, "step": 4108000 }, { "epoch": 8.60037009746458, "grad_norm": 22.80512809753418, "learning_rate": 7.012840037590831e-06, "loss": 2.3156, "step": 4108500 }, { "epoch": 8.601416753190207, "grad_norm": 21.676654815673828, "learning_rate": 7.007595781100539e-06, "loss": 2.3248, "step": 4109000 }, { "epoch": 8.602463408915833, "grad_norm": 18.39932632446289, "learning_rate": 7.0023515246102466e-06, "loss": 2.3203, "step": 4109500 }, { "epoch": 8.603510064641458, "grad_norm": 20.10652732849121, "learning_rate": 6.997107268119955e-06, "loss": 2.3207, "step": 4110000 }, { "epoch": 8.604556720367084, "grad_norm": 18.991628646850586, "learning_rate": 6.991863011629664e-06, "loss": 2.3027, "step": 4110500 }, { "epoch": 8.60560337609271, "grad_norm": 18.519939422607422, "learning_rate": 6.986618755139372e-06, "loss": 2.3141, "step": 4111000 }, { "epoch": 8.606650031818335, "grad_norm": 20.29377555847168, "learning_rate": 6.981374498649079e-06, "loss": 2.3282, "step": 4111500 }, { "epoch": 8.60769668754396, "grad_norm": 21.11332130432129, "learning_rate": 6.9761302421587876e-06, "loss": 2.3387, "step": 4112000 }, { "epoch": 8.608743343269586, "grad_norm": 17.071475982666016, "learning_rate": 6.970885985668496e-06, "loss": 2.3229, "step": 4112500 }, { "epoch": 8.609789998995211, "grad_norm": 35.639251708984375, "learning_rate": 6.965641729178205e-06, "loss": 2.3061, "step": 4113000 }, { "epoch": 8.610836654720837, "grad_norm": 19.324716567993164, "learning_rate": 6.960397472687913e-06, "loss": 2.315, "step": 4113500 }, { "epoch": 8.611883310446462, "grad_norm": 22.31117820739746, "learning_rate": 6.955153216197621e-06, "loss": 2.3118, "step": 4114000 }, { "epoch": 8.612929966172088, "grad_norm": 21.813617706298828, "learning_rate": 6.9499089597073286e-06, "loss": 2.3063, "step": 4114500 }, { "epoch": 8.613976621897713, "grad_norm": 17.581562042236328, "learning_rate": 6.944664703217037e-06, "loss": 2.3292, "step": 4115000 }, { "epoch": 8.615023277623338, "grad_norm": 17.5248966217041, "learning_rate": 6.939420446726745e-06, "loss": 2.3274, "step": 4115500 }, { "epoch": 8.616069933348964, "grad_norm": 21.151750564575195, "learning_rate": 6.934176190236453e-06, "loss": 2.2993, "step": 4116000 }, { "epoch": 8.61711658907459, "grad_norm": 19.078750610351562, "learning_rate": 6.928931933746162e-06, "loss": 2.3017, "step": 4116500 }, { "epoch": 8.618163244800215, "grad_norm": 19.964405059814453, "learning_rate": 6.92368767725587e-06, "loss": 2.3204, "step": 4117000 }, { "epoch": 8.61920990052584, "grad_norm": 19.8287353515625, "learning_rate": 6.918443420765577e-06, "loss": 2.3147, "step": 4117500 }, { "epoch": 8.620256556251466, "grad_norm": 17.168947219848633, "learning_rate": 6.913199164275286e-06, "loss": 2.2957, "step": 4118000 }, { "epoch": 8.621303211977091, "grad_norm": 21.014907836914062, "learning_rate": 6.907954907784994e-06, "loss": 2.3154, "step": 4118500 }, { "epoch": 8.622349867702717, "grad_norm": 19.33572006225586, "learning_rate": 6.902710651294703e-06, "loss": 2.3167, "step": 4119000 }, { "epoch": 8.623396523428342, "grad_norm": 22.508989334106445, "learning_rate": 6.897466394804411e-06, "loss": 2.3177, "step": 4119500 }, { "epoch": 8.624443179153968, "grad_norm": 20.538328170776367, "learning_rate": 6.892222138314118e-06, "loss": 2.3244, "step": 4120000 }, { "epoch": 8.625489834879593, "grad_norm": 17.90523338317871, "learning_rate": 6.886977881823827e-06, "loss": 2.3149, "step": 4120500 }, { "epoch": 8.626536490605218, "grad_norm": 21.80126953125, "learning_rate": 6.8817336253335345e-06, "loss": 2.313, "step": 4121000 }, { "epoch": 8.627583146330844, "grad_norm": 19.358474731445312, "learning_rate": 6.876489368843243e-06, "loss": 2.3072, "step": 4121500 }, { "epoch": 8.62862980205647, "grad_norm": 18.444664001464844, "learning_rate": 6.8712451123529516e-06, "loss": 2.3068, "step": 4122000 }, { "epoch": 8.629676457782095, "grad_norm": 21.144134521484375, "learning_rate": 6.86600085586266e-06, "loss": 2.3276, "step": 4122500 }, { "epoch": 8.63072311350772, "grad_norm": 18.99525260925293, "learning_rate": 6.860756599372367e-06, "loss": 2.316, "step": 4123000 }, { "epoch": 8.631769769233346, "grad_norm": 22.056241989135742, "learning_rate": 6.8555123428820755e-06, "loss": 2.3099, "step": 4123500 }, { "epoch": 8.632816424958971, "grad_norm": 19.045387268066406, "learning_rate": 6.850268086391784e-06, "loss": 2.3062, "step": 4124000 }, { "epoch": 8.633863080684597, "grad_norm": 16.220834732055664, "learning_rate": 6.8450238299014926e-06, "loss": 2.3159, "step": 4124500 }, { "epoch": 8.634909736410222, "grad_norm": 20.435617446899414, "learning_rate": 6.839779573411201e-06, "loss": 2.3135, "step": 4125000 }, { "epoch": 8.635956392135848, "grad_norm": 17.23980712890625, "learning_rate": 6.834535316920909e-06, "loss": 2.3129, "step": 4125500 }, { "epoch": 8.637003047861473, "grad_norm": 19.341463088989258, "learning_rate": 6.8292910604306165e-06, "loss": 2.318, "step": 4126000 }, { "epoch": 8.638049703587098, "grad_norm": 28.44196319580078, "learning_rate": 6.824046803940325e-06, "loss": 2.2836, "step": 4126500 }, { "epoch": 8.639096359312724, "grad_norm": 18.557544708251953, "learning_rate": 6.818802547450033e-06, "loss": 2.3033, "step": 4127000 }, { "epoch": 8.64014301503835, "grad_norm": 19.236406326293945, "learning_rate": 6.813558290959741e-06, "loss": 2.3077, "step": 4127500 }, { "epoch": 8.641189670763975, "grad_norm": 17.360538482666016, "learning_rate": 6.80831403446945e-06, "loss": 2.3002, "step": 4128000 }, { "epoch": 8.6422363264896, "grad_norm": 20.253429412841797, "learning_rate": 6.803069777979157e-06, "loss": 2.3056, "step": 4128500 }, { "epoch": 8.643282982215226, "grad_norm": 17.664657592773438, "learning_rate": 6.797825521488865e-06, "loss": 2.3138, "step": 4129000 }, { "epoch": 8.644329637940851, "grad_norm": 17.25895118713379, "learning_rate": 6.792581264998574e-06, "loss": 2.3155, "step": 4129500 }, { "epoch": 8.645376293666477, "grad_norm": 18.78516960144043, "learning_rate": 6.787337008508282e-06, "loss": 2.3247, "step": 4130000 }, { "epoch": 8.646422949392102, "grad_norm": 21.164100646972656, "learning_rate": 6.782092752017991e-06, "loss": 2.312, "step": 4130500 }, { "epoch": 8.647469605117728, "grad_norm": 22.583789825439453, "learning_rate": 6.776848495527699e-06, "loss": 2.3043, "step": 4131000 }, { "epoch": 8.648516260843353, "grad_norm": 16.604257583618164, "learning_rate": 6.771604239037406e-06, "loss": 2.3119, "step": 4131500 }, { "epoch": 8.649562916568978, "grad_norm": 17.999914169311523, "learning_rate": 6.766359982547115e-06, "loss": 2.2935, "step": 4132000 }, { "epoch": 8.650609572294604, "grad_norm": 17.909202575683594, "learning_rate": 6.761115726056822e-06, "loss": 2.3196, "step": 4132500 }, { "epoch": 8.65165622802023, "grad_norm": 18.72494125366211, "learning_rate": 6.755871469566531e-06, "loss": 2.3117, "step": 4133000 }, { "epoch": 8.652702883745855, "grad_norm": 21.679922103881836, "learning_rate": 6.7506272130762395e-06, "loss": 2.3088, "step": 4133500 }, { "epoch": 8.65374953947148, "grad_norm": 16.070240020751953, "learning_rate": 6.745382956585948e-06, "loss": 2.3306, "step": 4134000 }, { "epoch": 8.654796195197106, "grad_norm": 19.72726058959961, "learning_rate": 6.740138700095655e-06, "loss": 2.3079, "step": 4134500 }, { "epoch": 8.655842850922731, "grad_norm": 17.89316749572754, "learning_rate": 6.734894443605363e-06, "loss": 2.3009, "step": 4135000 }, { "epoch": 8.656889506648357, "grad_norm": 18.502103805541992, "learning_rate": 6.729650187115072e-06, "loss": 2.3037, "step": 4135500 }, { "epoch": 8.657936162373982, "grad_norm": 19.086963653564453, "learning_rate": 6.7244059306247805e-06, "loss": 2.296, "step": 4136000 }, { "epoch": 8.658982818099608, "grad_norm": 18.19300651550293, "learning_rate": 6.719161674134489e-06, "loss": 2.3181, "step": 4136500 }, { "epoch": 8.660029473825233, "grad_norm": 17.615535736083984, "learning_rate": 6.713917417644196e-06, "loss": 2.304, "step": 4137000 }, { "epoch": 8.661076129550858, "grad_norm": 21.713172912597656, "learning_rate": 6.708673161153904e-06, "loss": 2.3186, "step": 4137500 }, { "epoch": 8.662122785276484, "grad_norm": 20.980173110961914, "learning_rate": 6.703428904663613e-06, "loss": 2.301, "step": 4138000 }, { "epoch": 8.66316944100211, "grad_norm": 18.850332260131836, "learning_rate": 6.698184648173321e-06, "loss": 2.3006, "step": 4138500 }, { "epoch": 8.664216096727735, "grad_norm": 20.37605094909668, "learning_rate": 6.692940391683029e-06, "loss": 2.3097, "step": 4139000 }, { "epoch": 8.66526275245336, "grad_norm": 20.965612411499023, "learning_rate": 6.687696135192738e-06, "loss": 2.3189, "step": 4139500 }, { "epoch": 8.666309408178986, "grad_norm": 21.1893310546875, "learning_rate": 6.6824518787024445e-06, "loss": 2.3157, "step": 4140000 }, { "epoch": 8.667356063904611, "grad_norm": 18.874914169311523, "learning_rate": 6.677207622212153e-06, "loss": 2.3057, "step": 4140500 }, { "epoch": 8.668402719630237, "grad_norm": 23.268709182739258, "learning_rate": 6.671963365721862e-06, "loss": 2.3266, "step": 4141000 }, { "epoch": 8.669449375355862, "grad_norm": 17.192310333251953, "learning_rate": 6.66671910923157e-06, "loss": 2.3258, "step": 4141500 }, { "epoch": 8.670496031081488, "grad_norm": 18.126605987548828, "learning_rate": 6.661474852741279e-06, "loss": 2.3109, "step": 4142000 }, { "epoch": 8.671542686807115, "grad_norm": 17.368324279785156, "learning_rate": 6.656230596250987e-06, "loss": 2.3137, "step": 4142500 }, { "epoch": 8.672589342532738, "grad_norm": 20.023237228393555, "learning_rate": 6.650986339760694e-06, "loss": 2.3058, "step": 4143000 }, { "epoch": 8.673635998258366, "grad_norm": 17.91573143005371, "learning_rate": 6.645742083270403e-06, "loss": 2.3087, "step": 4143500 }, { "epoch": 8.67468265398399, "grad_norm": 17.257511138916016, "learning_rate": 6.640497826780111e-06, "loss": 2.3245, "step": 4144000 }, { "epoch": 8.675729309709617, "grad_norm": 15.808295249938965, "learning_rate": 6.635253570289819e-06, "loss": 2.3049, "step": 4144500 }, { "epoch": 8.676775965435242, "grad_norm": 19.367807388305664, "learning_rate": 6.630009313799527e-06, "loss": 2.31, "step": 4145000 }, { "epoch": 8.677822621160868, "grad_norm": 15.700664520263672, "learning_rate": 6.624765057309234e-06, "loss": 2.3019, "step": 4145500 }, { "epoch": 8.678869276886493, "grad_norm": 19.99297523498535, "learning_rate": 6.619520800818943e-06, "loss": 2.2892, "step": 4146000 }, { "epoch": 8.679915932612118, "grad_norm": 24.270896911621094, "learning_rate": 6.614276544328651e-06, "loss": 2.3017, "step": 4146500 }, { "epoch": 8.680962588337744, "grad_norm": 22.486703872680664, "learning_rate": 6.60903228783836e-06, "loss": 2.3144, "step": 4147000 }, { "epoch": 8.68200924406337, "grad_norm": 17.931636810302734, "learning_rate": 6.603788031348068e-06, "loss": 2.3007, "step": 4147500 }, { "epoch": 8.683055899788995, "grad_norm": 18.49884033203125, "learning_rate": 6.598543774857777e-06, "loss": 2.3048, "step": 4148000 }, { "epoch": 8.68410255551462, "grad_norm": 20.927810668945312, "learning_rate": 6.593299518367484e-06, "loss": 2.301, "step": 4148500 }, { "epoch": 8.685149211240246, "grad_norm": 20.39229965209961, "learning_rate": 6.588055261877192e-06, "loss": 2.3161, "step": 4149000 }, { "epoch": 8.686195866965871, "grad_norm": 17.918354034423828, "learning_rate": 6.582811005386901e-06, "loss": 2.3054, "step": 4149500 }, { "epoch": 8.687242522691497, "grad_norm": 17.627906799316406, "learning_rate": 6.5775667488966085e-06, "loss": 2.2943, "step": 4150000 }, { "epoch": 8.688289178417122, "grad_norm": 19.471702575683594, "learning_rate": 6.572322492406317e-06, "loss": 2.3097, "step": 4150500 }, { "epoch": 8.689335834142748, "grad_norm": 23.0517520904541, "learning_rate": 6.567078235916026e-06, "loss": 2.3084, "step": 4151000 }, { "epoch": 8.690382489868373, "grad_norm": 16.327585220336914, "learning_rate": 6.5618339794257324e-06, "loss": 2.3202, "step": 4151500 }, { "epoch": 8.691429145593998, "grad_norm": 16.733400344848633, "learning_rate": 6.556589722935441e-06, "loss": 2.3261, "step": 4152000 }, { "epoch": 8.692475801319624, "grad_norm": 18.879150390625, "learning_rate": 6.5513454664451495e-06, "loss": 2.2986, "step": 4152500 }, { "epoch": 8.69352245704525, "grad_norm": 23.013813018798828, "learning_rate": 6.546101209954858e-06, "loss": 2.3275, "step": 4153000 }, { "epoch": 8.694569112770875, "grad_norm": 19.16704750061035, "learning_rate": 6.540856953464567e-06, "loss": 2.3083, "step": 4153500 }, { "epoch": 8.6956157684965, "grad_norm": 20.935165405273438, "learning_rate": 6.5356126969742734e-06, "loss": 2.3026, "step": 4154000 }, { "epoch": 8.696662424222126, "grad_norm": 22.412063598632812, "learning_rate": 6.530368440483982e-06, "loss": 2.309, "step": 4154500 }, { "epoch": 8.697709079947751, "grad_norm": 20.661426544189453, "learning_rate": 6.5251241839936905e-06, "loss": 2.3139, "step": 4155000 }, { "epoch": 8.698755735673377, "grad_norm": 18.21918296813965, "learning_rate": 6.519879927503399e-06, "loss": 2.2992, "step": 4155500 }, { "epoch": 8.699802391399002, "grad_norm": 19.949670791625977, "learning_rate": 6.514635671013107e-06, "loss": 2.3075, "step": 4156000 }, { "epoch": 8.700849047124628, "grad_norm": 20.783884048461914, "learning_rate": 6.509391414522815e-06, "loss": 2.319, "step": 4156500 }, { "epoch": 8.701895702850253, "grad_norm": 19.284210205078125, "learning_rate": 6.504147158032522e-06, "loss": 2.3169, "step": 4157000 }, { "epoch": 8.702942358575879, "grad_norm": 18.756893157958984, "learning_rate": 6.498902901542231e-06, "loss": 2.3064, "step": 4157500 }, { "epoch": 8.703989014301504, "grad_norm": 18.639108657836914, "learning_rate": 6.493658645051939e-06, "loss": 2.293, "step": 4158000 }, { "epoch": 8.70503567002713, "grad_norm": 18.656147003173828, "learning_rate": 6.488414388561648e-06, "loss": 2.3142, "step": 4158500 }, { "epoch": 8.706082325752755, "grad_norm": 21.979202270507812, "learning_rate": 6.483170132071356e-06, "loss": 2.3016, "step": 4159000 }, { "epoch": 8.70712898147838, "grad_norm": 19.803455352783203, "learning_rate": 6.477925875581065e-06, "loss": 2.2987, "step": 4159500 }, { "epoch": 8.708175637204006, "grad_norm": 16.89605140686035, "learning_rate": 6.472681619090772e-06, "loss": 2.3096, "step": 4160000 }, { "epoch": 8.709222292929631, "grad_norm": 18.517545700073242, "learning_rate": 6.46743736260048e-06, "loss": 2.3201, "step": 4160500 }, { "epoch": 8.710268948655257, "grad_norm": 18.022705078125, "learning_rate": 6.462193106110189e-06, "loss": 2.3216, "step": 4161000 }, { "epoch": 8.711315604380882, "grad_norm": 19.895423889160156, "learning_rate": 6.4569488496198964e-06, "loss": 2.3128, "step": 4161500 }, { "epoch": 8.712362260106508, "grad_norm": 21.940242767333984, "learning_rate": 6.451704593129605e-06, "loss": 2.2994, "step": 4162000 }, { "epoch": 8.713408915832133, "grad_norm": 17.027137756347656, "learning_rate": 6.446460336639313e-06, "loss": 2.3381, "step": 4162500 }, { "epoch": 8.714455571557759, "grad_norm": 18.22771644592285, "learning_rate": 6.44121608014902e-06, "loss": 2.3146, "step": 4163000 }, { "epoch": 8.715502227283384, "grad_norm": 19.2067928314209, "learning_rate": 6.435971823658729e-06, "loss": 2.3156, "step": 4163500 }, { "epoch": 8.71654888300901, "grad_norm": 19.918397903442383, "learning_rate": 6.4307275671684374e-06, "loss": 2.3065, "step": 4164000 }, { "epoch": 8.717595538734635, "grad_norm": 17.726924896240234, "learning_rate": 6.425483310678146e-06, "loss": 2.3086, "step": 4164500 }, { "epoch": 8.71864219446026, "grad_norm": 20.858556747436523, "learning_rate": 6.4202390541878545e-06, "loss": 2.3093, "step": 4165000 }, { "epoch": 8.719688850185886, "grad_norm": 20.0575008392334, "learning_rate": 6.414994797697561e-06, "loss": 2.3007, "step": 4165500 }, { "epoch": 8.720735505911511, "grad_norm": 23.53080177307129, "learning_rate": 6.40975054120727e-06, "loss": 2.3166, "step": 4166000 }, { "epoch": 8.721782161637137, "grad_norm": 20.13054084777832, "learning_rate": 6.404506284716978e-06, "loss": 2.2991, "step": 4166500 }, { "epoch": 8.722828817362762, "grad_norm": 18.27471160888672, "learning_rate": 6.399262028226687e-06, "loss": 2.3156, "step": 4167000 }, { "epoch": 8.723875473088388, "grad_norm": 20.18317222595215, "learning_rate": 6.394017771736395e-06, "loss": 2.3074, "step": 4167500 }, { "epoch": 8.724922128814013, "grad_norm": 16.676876068115234, "learning_rate": 6.388773515246103e-06, "loss": 2.2979, "step": 4168000 }, { "epoch": 8.725968784539639, "grad_norm": 15.742939949035645, "learning_rate": 6.38352925875581e-06, "loss": 2.299, "step": 4168500 }, { "epoch": 8.727015440265264, "grad_norm": 18.19965934753418, "learning_rate": 6.3782850022655186e-06, "loss": 2.2997, "step": 4169000 }, { "epoch": 8.72806209599089, "grad_norm": 19.110382080078125, "learning_rate": 6.373040745775227e-06, "loss": 2.308, "step": 4169500 }, { "epoch": 8.729108751716515, "grad_norm": 17.979455947875977, "learning_rate": 6.367796489284936e-06, "loss": 2.3093, "step": 4170000 }, { "epoch": 8.73015540744214, "grad_norm": 22.590961456298828, "learning_rate": 6.362552232794644e-06, "loss": 2.3086, "step": 4170500 }, { "epoch": 8.731202063167766, "grad_norm": 27.983667373657227, "learning_rate": 6.357307976304351e-06, "loss": 2.3116, "step": 4171000 }, { "epoch": 8.732248718893391, "grad_norm": 20.50568199157715, "learning_rate": 6.3520637198140596e-06, "loss": 2.3022, "step": 4171500 }, { "epoch": 8.733295374619017, "grad_norm": 20.50535011291504, "learning_rate": 6.346819463323768e-06, "loss": 2.3132, "step": 4172000 }, { "epoch": 8.734342030344642, "grad_norm": 18.7188720703125, "learning_rate": 6.341575206833477e-06, "loss": 2.3058, "step": 4172500 }, { "epoch": 8.735388686070268, "grad_norm": 20.02104949951172, "learning_rate": 6.336330950343184e-06, "loss": 2.3198, "step": 4173000 }, { "epoch": 8.736435341795893, "grad_norm": 17.85128402709961, "learning_rate": 6.331086693852893e-06, "loss": 2.2961, "step": 4173500 }, { "epoch": 8.737481997521519, "grad_norm": 16.975507736206055, "learning_rate": 6.3258424373626006e-06, "loss": 2.3281, "step": 4174000 }, { "epoch": 8.738528653247144, "grad_norm": 18.219045639038086, "learning_rate": 6.320598180872308e-06, "loss": 2.3215, "step": 4174500 }, { "epoch": 8.73957530897277, "grad_norm": 24.21605682373047, "learning_rate": 6.315353924382017e-06, "loss": 2.3556, "step": 4175000 }, { "epoch": 8.740621964698395, "grad_norm": 19.661468505859375, "learning_rate": 6.310109667891725e-06, "loss": 2.3063, "step": 4175500 }, { "epoch": 8.74166862042402, "grad_norm": 20.970752716064453, "learning_rate": 6.304865411401434e-06, "loss": 2.3061, "step": 4176000 }, { "epoch": 8.742715276149646, "grad_norm": 27.554162979125977, "learning_rate": 6.299621154911142e-06, "loss": 2.3122, "step": 4176500 }, { "epoch": 8.743761931875273, "grad_norm": 17.52740478515625, "learning_rate": 6.294376898420849e-06, "loss": 2.3072, "step": 4177000 }, { "epoch": 8.744808587600897, "grad_norm": 21.832462310791016, "learning_rate": 6.289132641930558e-06, "loss": 2.2951, "step": 4177500 }, { "epoch": 8.745855243326524, "grad_norm": 26.907413482666016, "learning_rate": 6.283888385440266e-06, "loss": 2.3178, "step": 4178000 }, { "epoch": 8.746901899052148, "grad_norm": 18.45553970336914, "learning_rate": 6.278644128949975e-06, "loss": 2.3178, "step": 4178500 }, { "epoch": 8.747948554777775, "grad_norm": 19.132671356201172, "learning_rate": 6.2733998724596826e-06, "loss": 2.3148, "step": 4179000 }, { "epoch": 8.7489952105034, "grad_norm": 20.008352279663086, "learning_rate": 6.26815561596939e-06, "loss": 2.3089, "step": 4179500 }, { "epoch": 8.750041866229026, "grad_norm": 21.690946578979492, "learning_rate": 6.262911359479098e-06, "loss": 2.3128, "step": 4180000 }, { "epoch": 8.751088521954651, "grad_norm": 14.964045524597168, "learning_rate": 6.2576671029888065e-06, "loss": 2.3123, "step": 4180500 }, { "epoch": 8.752135177680277, "grad_norm": 21.060392379760742, "learning_rate": 6.252422846498515e-06, "loss": 2.3117, "step": 4181000 }, { "epoch": 8.753181833405902, "grad_norm": 19.190515518188477, "learning_rate": 6.2471785900082236e-06, "loss": 2.3097, "step": 4181500 }, { "epoch": 8.754228489131528, "grad_norm": 22.726064682006836, "learning_rate": 6.241934333517931e-06, "loss": 2.2986, "step": 4182000 }, { "epoch": 8.755275144857153, "grad_norm": 19.751224517822266, "learning_rate": 6.23669007702764e-06, "loss": 2.318, "step": 4182500 }, { "epoch": 8.756321800582779, "grad_norm": 18.566020965576172, "learning_rate": 6.231445820537348e-06, "loss": 2.3057, "step": 4183000 }, { "epoch": 8.757368456308404, "grad_norm": 17.365053176879883, "learning_rate": 6.226201564047056e-06, "loss": 2.3213, "step": 4183500 }, { "epoch": 8.75841511203403, "grad_norm": 20.000892639160156, "learning_rate": 6.2209573075567646e-06, "loss": 2.2958, "step": 4184000 }, { "epoch": 8.759461767759655, "grad_norm": 20.523534774780273, "learning_rate": 6.215713051066472e-06, "loss": 2.3084, "step": 4184500 }, { "epoch": 8.76050842348528, "grad_norm": 22.53447723388672, "learning_rate": 6.210468794576181e-06, "loss": 2.2991, "step": 4185000 }, { "epoch": 8.761555079210906, "grad_norm": 31.878620147705078, "learning_rate": 6.2052245380858885e-06, "loss": 2.29, "step": 4185500 }, { "epoch": 8.762601734936531, "grad_norm": 19.572383880615234, "learning_rate": 6.199980281595596e-06, "loss": 2.3227, "step": 4186000 }, { "epoch": 8.763648390662157, "grad_norm": 17.964941024780273, "learning_rate": 6.194736025105305e-06, "loss": 2.3106, "step": 4186500 }, { "epoch": 8.764695046387782, "grad_norm": 19.237218856811523, "learning_rate": 6.189491768615013e-06, "loss": 2.3186, "step": 4187000 }, { "epoch": 8.765741702113408, "grad_norm": 19.423404693603516, "learning_rate": 6.184247512124721e-06, "loss": 2.2864, "step": 4187500 }, { "epoch": 8.766788357839033, "grad_norm": 17.972909927368164, "learning_rate": 6.1790032556344295e-06, "loss": 2.2882, "step": 4188000 }, { "epoch": 8.767835013564659, "grad_norm": 18.78215980529785, "learning_rate": 6.173758999144138e-06, "loss": 2.3171, "step": 4188500 }, { "epoch": 8.768881669290284, "grad_norm": 20.085731506347656, "learning_rate": 6.168514742653846e-06, "loss": 2.3219, "step": 4189000 }, { "epoch": 8.76992832501591, "grad_norm": 21.248178482055664, "learning_rate": 6.163270486163554e-06, "loss": 2.3096, "step": 4189500 }, { "epoch": 8.770974980741535, "grad_norm": 15.752341270446777, "learning_rate": 6.158026229673263e-06, "loss": 2.2954, "step": 4190000 }, { "epoch": 8.77202163646716, "grad_norm": 24.49516487121582, "learning_rate": 6.1527819731829705e-06, "loss": 2.3069, "step": 4190500 }, { "epoch": 8.773068292192786, "grad_norm": 16.992000579833984, "learning_rate": 6.147537716692678e-06, "loss": 2.2877, "step": 4191000 }, { "epoch": 8.774114947918411, "grad_norm": 17.161273956298828, "learning_rate": 6.142293460202387e-06, "loss": 2.2925, "step": 4191500 }, { "epoch": 8.775161603644037, "grad_norm": 19.614173889160156, "learning_rate": 6.137049203712094e-06, "loss": 2.3267, "step": 4192000 }, { "epoch": 8.776208259369662, "grad_norm": 17.918930053710938, "learning_rate": 6.131804947221803e-06, "loss": 2.3063, "step": 4192500 }, { "epoch": 8.777254915095288, "grad_norm": 15.294358253479004, "learning_rate": 6.126560690731511e-06, "loss": 2.3073, "step": 4193000 }, { "epoch": 8.778301570820913, "grad_norm": 19.850801467895508, "learning_rate": 6.121316434241219e-06, "loss": 2.3118, "step": 4193500 }, { "epoch": 8.779348226546539, "grad_norm": 23.273210525512695, "learning_rate": 6.116072177750928e-06, "loss": 2.3041, "step": 4194000 }, { "epoch": 8.780394882272164, "grad_norm": 20.19832992553711, "learning_rate": 6.110827921260635e-06, "loss": 2.3083, "step": 4194500 }, { "epoch": 8.78144153799779, "grad_norm": 17.771894454956055, "learning_rate": 6.105583664770344e-06, "loss": 2.3184, "step": 4195000 }, { "epoch": 8.782488193723415, "grad_norm": 18.309429168701172, "learning_rate": 6.1003394082800525e-06, "loss": 2.3267, "step": 4195500 }, { "epoch": 8.78353484944904, "grad_norm": 18.181655883789062, "learning_rate": 6.09509515178976e-06, "loss": 2.3117, "step": 4196000 }, { "epoch": 8.784581505174666, "grad_norm": 19.31943702697754, "learning_rate": 6.089850895299469e-06, "loss": 2.3126, "step": 4196500 }, { "epoch": 8.785628160900291, "grad_norm": 16.91286277770996, "learning_rate": 6.084606638809176e-06, "loss": 2.317, "step": 4197000 }, { "epoch": 8.786674816625917, "grad_norm": 25.162508010864258, "learning_rate": 6.079362382318884e-06, "loss": 2.3113, "step": 4197500 }, { "epoch": 8.787721472351542, "grad_norm": 18.052978515625, "learning_rate": 6.074118125828593e-06, "loss": 2.3121, "step": 4198000 }, { "epoch": 8.788768128077168, "grad_norm": 18.67255401611328, "learning_rate": 6.068873869338301e-06, "loss": 2.2952, "step": 4198500 }, { "epoch": 8.789814783802793, "grad_norm": 17.447359085083008, "learning_rate": 6.063629612848009e-06, "loss": 2.2987, "step": 4199000 }, { "epoch": 8.790861439528419, "grad_norm": 22.78276252746582, "learning_rate": 6.058385356357717e-06, "loss": 2.3097, "step": 4199500 }, { "epoch": 8.791908095254044, "grad_norm": 17.83179473876953, "learning_rate": 6.053141099867426e-06, "loss": 2.3046, "step": 4200000 }, { "epoch": 8.79295475097967, "grad_norm": 21.184402465820312, "learning_rate": 6.047896843377134e-06, "loss": 2.329, "step": 4200500 }, { "epoch": 8.794001406705295, "grad_norm": 17.96770668029785, "learning_rate": 6.042652586886842e-06, "loss": 2.3095, "step": 4201000 }, { "epoch": 8.79504806243092, "grad_norm": 18.064889907836914, "learning_rate": 6.03740833039655e-06, "loss": 2.3031, "step": 4201500 }, { "epoch": 8.796094718156546, "grad_norm": 18.059553146362305, "learning_rate": 6.032164073906258e-06, "loss": 2.295, "step": 4202000 }, { "epoch": 8.797141373882171, "grad_norm": 20.636274337768555, "learning_rate": 6.026919817415966e-06, "loss": 2.3395, "step": 4202500 }, { "epoch": 8.798188029607797, "grad_norm": 18.824840545654297, "learning_rate": 6.021675560925675e-06, "loss": 2.2816, "step": 4203000 }, { "epoch": 8.799234685333422, "grad_norm": 19.31305694580078, "learning_rate": 6.016431304435382e-06, "loss": 2.3083, "step": 4203500 }, { "epoch": 8.800281341059048, "grad_norm": 22.112823486328125, "learning_rate": 6.011187047945091e-06, "loss": 2.2988, "step": 4204000 }, { "epoch": 8.801327996784673, "grad_norm": 18.365144729614258, "learning_rate": 6.0059427914547985e-06, "loss": 2.3018, "step": 4204500 }, { "epoch": 8.802374652510299, "grad_norm": 21.297645568847656, "learning_rate": 6.000698534964507e-06, "loss": 2.3037, "step": 4205000 }, { "epoch": 8.803421308235924, "grad_norm": 18.95937728881836, "learning_rate": 5.995454278474216e-06, "loss": 2.3053, "step": 4205500 }, { "epoch": 8.80446796396155, "grad_norm": 18.117006301879883, "learning_rate": 5.990210021983923e-06, "loss": 2.3054, "step": 4206000 }, { "epoch": 8.805514619687175, "grad_norm": 16.687786102294922, "learning_rate": 5.984965765493632e-06, "loss": 2.316, "step": 4206500 }, { "epoch": 8.8065612754128, "grad_norm": 17.2285099029541, "learning_rate": 5.97972150900334e-06, "loss": 2.3136, "step": 4207000 }, { "epoch": 8.807607931138426, "grad_norm": 15.66031265258789, "learning_rate": 5.974477252513048e-06, "loss": 2.3083, "step": 4207500 }, { "epoch": 8.808654586864051, "grad_norm": 18.904708862304688, "learning_rate": 5.969232996022757e-06, "loss": 2.3112, "step": 4208000 }, { "epoch": 8.809701242589677, "grad_norm": 20.080820083618164, "learning_rate": 5.963988739532464e-06, "loss": 2.3069, "step": 4208500 }, { "epoch": 8.810747898315302, "grad_norm": 19.721275329589844, "learning_rate": 5.958744483042172e-06, "loss": 2.3089, "step": 4209000 }, { "epoch": 8.811794554040928, "grad_norm": 19.577804565429688, "learning_rate": 5.9535002265518805e-06, "loss": 2.3328, "step": 4209500 }, { "epoch": 8.812841209766553, "grad_norm": 18.68763542175293, "learning_rate": 5.948255970061588e-06, "loss": 2.3206, "step": 4210000 }, { "epoch": 8.813887865492179, "grad_norm": 19.86869239807129, "learning_rate": 5.943011713571297e-06, "loss": 2.2917, "step": 4210500 }, { "epoch": 8.814934521217804, "grad_norm": 19.758291244506836, "learning_rate": 5.937767457081005e-06, "loss": 2.3162, "step": 4211000 }, { "epoch": 8.81598117694343, "grad_norm": 20.358779907226562, "learning_rate": 5.932523200590713e-06, "loss": 2.3186, "step": 4211500 }, { "epoch": 8.817027832669055, "grad_norm": 21.943870544433594, "learning_rate": 5.9272789441004215e-06, "loss": 2.284, "step": 4212000 }, { "epoch": 8.818074488394682, "grad_norm": 19.605070114135742, "learning_rate": 5.92203468761013e-06, "loss": 2.3093, "step": 4212500 }, { "epoch": 8.819121144120306, "grad_norm": 19.827709197998047, "learning_rate": 5.916790431119838e-06, "loss": 2.3071, "step": 4213000 }, { "epoch": 8.820167799845933, "grad_norm": 20.52306365966797, "learning_rate": 5.911546174629546e-06, "loss": 2.3048, "step": 4213500 }, { "epoch": 8.821214455571559, "grad_norm": 18.310649871826172, "learning_rate": 5.906301918139254e-06, "loss": 2.2847, "step": 4214000 }, { "epoch": 8.822261111297184, "grad_norm": 24.192655563354492, "learning_rate": 5.9010576616489625e-06, "loss": 2.3003, "step": 4214500 }, { "epoch": 8.82330776702281, "grad_norm": 17.823148727416992, "learning_rate": 5.89581340515867e-06, "loss": 2.2857, "step": 4215000 }, { "epoch": 8.824354422748435, "grad_norm": 16.6337833404541, "learning_rate": 5.890569148668379e-06, "loss": 2.2996, "step": 4215500 }, { "epoch": 8.82540107847406, "grad_norm": 22.20743751525879, "learning_rate": 5.8853248921780864e-06, "loss": 2.3027, "step": 4216000 }, { "epoch": 8.826447734199686, "grad_norm": 17.562686920166016, "learning_rate": 5.880080635687795e-06, "loss": 2.2979, "step": 4216500 }, { "epoch": 8.827494389925311, "grad_norm": 21.03754997253418, "learning_rate": 5.8748363791975035e-06, "loss": 2.2969, "step": 4217000 }, { "epoch": 8.828541045650937, "grad_norm": 16.955202102661133, "learning_rate": 5.869592122707211e-06, "loss": 2.321, "step": 4217500 }, { "epoch": 8.829587701376562, "grad_norm": 18.525869369506836, "learning_rate": 5.86434786621692e-06, "loss": 2.3068, "step": 4218000 }, { "epoch": 8.830634357102188, "grad_norm": 17.22722625732422, "learning_rate": 5.8591036097266274e-06, "loss": 2.3054, "step": 4218500 }, { "epoch": 8.831681012827813, "grad_norm": 20.039024353027344, "learning_rate": 5.853859353236336e-06, "loss": 2.3079, "step": 4219000 }, { "epoch": 8.832727668553439, "grad_norm": 22.45267677307129, "learning_rate": 5.8486150967460445e-06, "loss": 2.3347, "step": 4219500 }, { "epoch": 8.833774324279064, "grad_norm": 21.1621036529541, "learning_rate": 5.843370840255752e-06, "loss": 2.3039, "step": 4220000 }, { "epoch": 8.83482098000469, "grad_norm": 17.621618270874023, "learning_rate": 5.83812658376546e-06, "loss": 2.3145, "step": 4220500 }, { "epoch": 8.835867635730315, "grad_norm": 17.7886905670166, "learning_rate": 5.8328823272751684e-06, "loss": 2.2975, "step": 4221000 }, { "epoch": 8.83691429145594, "grad_norm": 20.852725982666016, "learning_rate": 5.827638070784876e-06, "loss": 2.3109, "step": 4221500 }, { "epoch": 8.837960947181566, "grad_norm": 16.282428741455078, "learning_rate": 5.822393814294585e-06, "loss": 2.2999, "step": 4222000 }, { "epoch": 8.839007602907191, "grad_norm": 19.231124877929688, "learning_rate": 5.817149557804293e-06, "loss": 2.3098, "step": 4222500 }, { "epoch": 8.840054258632817, "grad_norm": 18.116455078125, "learning_rate": 5.811905301314001e-06, "loss": 2.3075, "step": 4223000 }, { "epoch": 8.841100914358442, "grad_norm": 18.713956832885742, "learning_rate": 5.8066610448237094e-06, "loss": 2.3049, "step": 4223500 }, { "epoch": 8.842147570084068, "grad_norm": 19.696550369262695, "learning_rate": 5.801416788333418e-06, "loss": 2.2802, "step": 4224000 }, { "epoch": 8.843194225809693, "grad_norm": 17.62839698791504, "learning_rate": 5.796172531843126e-06, "loss": 2.2884, "step": 4224500 }, { "epoch": 8.844240881535319, "grad_norm": 19.887081146240234, "learning_rate": 5.790928275352834e-06, "loss": 2.302, "step": 4225000 }, { "epoch": 8.845287537260944, "grad_norm": 17.23154640197754, "learning_rate": 5.785684018862543e-06, "loss": 2.3241, "step": 4225500 }, { "epoch": 8.84633419298657, "grad_norm": 18.46407699584961, "learning_rate": 5.7804397623722504e-06, "loss": 2.303, "step": 4226000 }, { "epoch": 8.847380848712195, "grad_norm": 17.947011947631836, "learning_rate": 5.775195505881958e-06, "loss": 2.2829, "step": 4226500 }, { "epoch": 8.84842750443782, "grad_norm": 18.23003387451172, "learning_rate": 5.769951249391667e-06, "loss": 2.3017, "step": 4227000 }, { "epoch": 8.849474160163446, "grad_norm": 18.367843627929688, "learning_rate": 5.764706992901374e-06, "loss": 2.2994, "step": 4227500 }, { "epoch": 8.850520815889071, "grad_norm": 17.742454528808594, "learning_rate": 5.759462736411083e-06, "loss": 2.2934, "step": 4228000 }, { "epoch": 8.851567471614697, "grad_norm": 20.871034622192383, "learning_rate": 5.754218479920791e-06, "loss": 2.3056, "step": 4228500 }, { "epoch": 8.852614127340322, "grad_norm": 19.265594482421875, "learning_rate": 5.748974223430499e-06, "loss": 2.3112, "step": 4229000 }, { "epoch": 8.853660783065948, "grad_norm": 20.873071670532227, "learning_rate": 5.743729966940208e-06, "loss": 2.3025, "step": 4229500 }, { "epoch": 8.854707438791573, "grad_norm": 17.652376174926758, "learning_rate": 5.738485710449915e-06, "loss": 2.2875, "step": 4230000 }, { "epoch": 8.855754094517199, "grad_norm": 20.605777740478516, "learning_rate": 5.733241453959624e-06, "loss": 2.3187, "step": 4230500 }, { "epoch": 8.856800750242824, "grad_norm": 18.3855037689209, "learning_rate": 5.727997197469332e-06, "loss": 2.2925, "step": 4231000 }, { "epoch": 8.85784740596845, "grad_norm": 21.19786834716797, "learning_rate": 5.72275294097904e-06, "loss": 2.3184, "step": 4231500 }, { "epoch": 8.858894061694075, "grad_norm": 18.96221351623535, "learning_rate": 5.717508684488748e-06, "loss": 2.2977, "step": 4232000 }, { "epoch": 8.8599407174197, "grad_norm": 18.509183883666992, "learning_rate": 5.712264427998456e-06, "loss": 2.2941, "step": 4232500 }, { "epoch": 8.860987373145326, "grad_norm": 18.193941116333008, "learning_rate": 5.707020171508164e-06, "loss": 2.3078, "step": 4233000 }, { "epoch": 8.862034028870951, "grad_norm": 22.65190887451172, "learning_rate": 5.7017759150178726e-06, "loss": 2.3007, "step": 4233500 }, { "epoch": 8.863080684596577, "grad_norm": 20.994705200195312, "learning_rate": 5.696531658527581e-06, "loss": 2.2901, "step": 4234000 }, { "epoch": 8.864127340322202, "grad_norm": 22.22871971130371, "learning_rate": 5.691287402037289e-06, "loss": 2.3006, "step": 4234500 }, { "epoch": 8.865173996047828, "grad_norm": 18.252307891845703, "learning_rate": 5.686043145546997e-06, "loss": 2.2949, "step": 4235000 }, { "epoch": 8.866220651773453, "grad_norm": 22.336170196533203, "learning_rate": 5.680798889056706e-06, "loss": 2.2895, "step": 4235500 }, { "epoch": 8.867267307499079, "grad_norm": 21.47928810119629, "learning_rate": 5.6755546325664136e-06, "loss": 2.2968, "step": 4236000 }, { "epoch": 8.868313963224704, "grad_norm": 18.27018928527832, "learning_rate": 5.670310376076122e-06, "loss": 2.2986, "step": 4236500 }, { "epoch": 8.86936061895033, "grad_norm": 18.439189910888672, "learning_rate": 5.66506611958583e-06, "loss": 2.3007, "step": 4237000 }, { "epoch": 8.870407274675955, "grad_norm": 23.807998657226562, "learning_rate": 5.659821863095538e-06, "loss": 2.3086, "step": 4237500 }, { "epoch": 8.87145393040158, "grad_norm": 19.914262771606445, "learning_rate": 5.654577606605246e-06, "loss": 2.3045, "step": 4238000 }, { "epoch": 8.872500586127206, "grad_norm": 19.551259994506836, "learning_rate": 5.649333350114954e-06, "loss": 2.3175, "step": 4238500 }, { "epoch": 8.873547241852831, "grad_norm": 17.14333724975586, "learning_rate": 5.644089093624662e-06, "loss": 2.2966, "step": 4239000 }, { "epoch": 8.874593897578457, "grad_norm": 18.264629364013672, "learning_rate": 5.638844837134371e-06, "loss": 2.3097, "step": 4239500 }, { "epoch": 8.875640553304082, "grad_norm": 19.813312530517578, "learning_rate": 5.6336005806440785e-06, "loss": 2.2885, "step": 4240000 }, { "epoch": 8.876687209029708, "grad_norm": 22.491653442382812, "learning_rate": 5.628356324153787e-06, "loss": 2.2829, "step": 4240500 }, { "epoch": 8.877733864755333, "grad_norm": 19.31403160095215, "learning_rate": 5.6231120676634956e-06, "loss": 2.3036, "step": 4241000 }, { "epoch": 8.878780520480959, "grad_norm": 19.04964256286621, "learning_rate": 5.617867811173203e-06, "loss": 2.3039, "step": 4241500 }, { "epoch": 8.879827176206584, "grad_norm": 19.38524627685547, "learning_rate": 5.612623554682912e-06, "loss": 2.2764, "step": 4242000 }, { "epoch": 8.88087383193221, "grad_norm": 26.19223976135254, "learning_rate": 5.60737929819262e-06, "loss": 2.3097, "step": 4242500 }, { "epoch": 8.881920487657835, "grad_norm": 19.60045051574707, "learning_rate": 5.602135041702328e-06, "loss": 2.3139, "step": 4243000 }, { "epoch": 8.88296714338346, "grad_norm": 17.850086212158203, "learning_rate": 5.596890785212036e-06, "loss": 2.2818, "step": 4243500 }, { "epoch": 8.884013799109086, "grad_norm": 19.736125946044922, "learning_rate": 5.591646528721744e-06, "loss": 2.2915, "step": 4244000 }, { "epoch": 8.885060454834711, "grad_norm": 18.20684814453125, "learning_rate": 5.586402272231452e-06, "loss": 2.3004, "step": 4244500 }, { "epoch": 8.886107110560337, "grad_norm": 18.463163375854492, "learning_rate": 5.5811580157411605e-06, "loss": 2.3038, "step": 4245000 }, { "epoch": 8.887153766285962, "grad_norm": 17.140260696411133, "learning_rate": 5.575913759250868e-06, "loss": 2.3206, "step": 4245500 }, { "epoch": 8.888200422011588, "grad_norm": 17.707883834838867, "learning_rate": 5.570669502760577e-06, "loss": 2.3007, "step": 4246000 }, { "epoch": 8.889247077737213, "grad_norm": 21.040576934814453, "learning_rate": 5.565425246270285e-06, "loss": 2.3064, "step": 4246500 }, { "epoch": 8.89029373346284, "grad_norm": 23.505247116088867, "learning_rate": 5.560180989779993e-06, "loss": 2.329, "step": 4247000 }, { "epoch": 8.891340389188464, "grad_norm": 19.304126739501953, "learning_rate": 5.5549367332897015e-06, "loss": 2.2884, "step": 4247500 }, { "epoch": 8.892387044914091, "grad_norm": 19.00141716003418, "learning_rate": 5.54969247679941e-06, "loss": 2.2881, "step": 4248000 }, { "epoch": 8.893433700639715, "grad_norm": 20.193212509155273, "learning_rate": 5.544448220309118e-06, "loss": 2.313, "step": 4248500 }, { "epoch": 8.894480356365342, "grad_norm": 15.802923202514648, "learning_rate": 5.539203963818826e-06, "loss": 2.3066, "step": 4249000 }, { "epoch": 8.895527012090968, "grad_norm": 17.659786224365234, "learning_rate": 5.533959707328534e-06, "loss": 2.306, "step": 4249500 }, { "epoch": 8.896573667816593, "grad_norm": 17.8280029296875, "learning_rate": 5.528715450838242e-06, "loss": 2.2853, "step": 4250000 }, { "epoch": 8.897620323542219, "grad_norm": 24.4609432220459, "learning_rate": 5.52347119434795e-06, "loss": 2.3038, "step": 4250500 }, { "epoch": 8.898666979267844, "grad_norm": 22.500707626342773, "learning_rate": 5.518226937857659e-06, "loss": 2.3033, "step": 4251000 }, { "epoch": 8.89971363499347, "grad_norm": 21.109355926513672, "learning_rate": 5.512982681367366e-06, "loss": 2.2972, "step": 4251500 }, { "epoch": 8.900760290719095, "grad_norm": 20.001708984375, "learning_rate": 5.507738424877075e-06, "loss": 2.3296, "step": 4252000 }, { "epoch": 8.90180694644472, "grad_norm": 15.954595565795898, "learning_rate": 5.5024941683867835e-06, "loss": 2.2966, "step": 4252500 }, { "epoch": 8.902853602170346, "grad_norm": 21.33171272277832, "learning_rate": 5.497249911896491e-06, "loss": 2.3003, "step": 4253000 }, { "epoch": 8.903900257895971, "grad_norm": 18.865453720092773, "learning_rate": 5.4920056554062e-06, "loss": 2.2908, "step": 4253500 }, { "epoch": 8.904946913621597, "grad_norm": 17.179943084716797, "learning_rate": 5.486761398915907e-06, "loss": 2.3014, "step": 4254000 }, { "epoch": 8.905993569347222, "grad_norm": 22.76430892944336, "learning_rate": 5.481517142425616e-06, "loss": 2.3021, "step": 4254500 }, { "epoch": 8.907040225072848, "grad_norm": 20.274490356445312, "learning_rate": 5.4762728859353245e-06, "loss": 2.3017, "step": 4255000 }, { "epoch": 8.908086880798473, "grad_norm": 21.623069763183594, "learning_rate": 5.471028629445032e-06, "loss": 2.3124, "step": 4255500 }, { "epoch": 8.909133536524099, "grad_norm": 16.659828186035156, "learning_rate": 5.46578437295474e-06, "loss": 2.2862, "step": 4256000 }, { "epoch": 8.910180192249724, "grad_norm": 17.901853561401367, "learning_rate": 5.460540116464448e-06, "loss": 2.3186, "step": 4256500 }, { "epoch": 8.91122684797535, "grad_norm": 18.027847290039062, "learning_rate": 5.455295859974156e-06, "loss": 2.3134, "step": 4257000 }, { "epoch": 8.912273503700975, "grad_norm": 19.370813369750977, "learning_rate": 5.450051603483865e-06, "loss": 2.3081, "step": 4257500 }, { "epoch": 8.9133201594266, "grad_norm": 23.148286819458008, "learning_rate": 5.444807346993573e-06, "loss": 2.3188, "step": 4258000 }, { "epoch": 8.914366815152226, "grad_norm": 20.3751220703125, "learning_rate": 5.439563090503281e-06, "loss": 2.2923, "step": 4258500 }, { "epoch": 8.915413470877851, "grad_norm": 23.81209945678711, "learning_rate": 5.434318834012989e-06, "loss": 2.296, "step": 4259000 }, { "epoch": 8.916460126603477, "grad_norm": 22.562116622924805, "learning_rate": 5.429074577522698e-06, "loss": 2.3161, "step": 4259500 }, { "epoch": 8.917506782329102, "grad_norm": 18.21100616455078, "learning_rate": 5.423830321032406e-06, "loss": 2.3045, "step": 4260000 }, { "epoch": 8.918553438054728, "grad_norm": 17.977041244506836, "learning_rate": 5.418586064542114e-06, "loss": 2.2809, "step": 4260500 }, { "epoch": 8.919600093780353, "grad_norm": 27.502973556518555, "learning_rate": 5.413341808051822e-06, "loss": 2.3013, "step": 4261000 }, { "epoch": 8.920646749505979, "grad_norm": 20.03350257873535, "learning_rate": 5.4080975515615295e-06, "loss": 2.311, "step": 4261500 }, { "epoch": 8.921693405231604, "grad_norm": 18.25152587890625, "learning_rate": 5.402853295071238e-06, "loss": 2.3104, "step": 4262000 }, { "epoch": 8.92274006095723, "grad_norm": 19.803997039794922, "learning_rate": 5.397609038580946e-06, "loss": 2.309, "step": 4262500 }, { "epoch": 8.923786716682855, "grad_norm": 23.300172805786133, "learning_rate": 5.392364782090654e-06, "loss": 2.2898, "step": 4263000 }, { "epoch": 8.92483337240848, "grad_norm": 24.358657836914062, "learning_rate": 5.387120525600363e-06, "loss": 2.31, "step": 4263500 }, { "epoch": 8.925880028134106, "grad_norm": 16.68951988220215, "learning_rate": 5.3818762691100705e-06, "loss": 2.3002, "step": 4264000 }, { "epoch": 8.926926683859731, "grad_norm": 18.21406364440918, "learning_rate": 5.376632012619779e-06, "loss": 2.3106, "step": 4264500 }, { "epoch": 8.927973339585357, "grad_norm": 19.0594539642334, "learning_rate": 5.371387756129488e-06, "loss": 2.319, "step": 4265000 }, { "epoch": 8.929019995310982, "grad_norm": 18.501020431518555, "learning_rate": 5.366143499639195e-06, "loss": 2.3155, "step": 4265500 }, { "epoch": 8.930066651036608, "grad_norm": 17.113616943359375, "learning_rate": 5.360899243148904e-06, "loss": 2.2916, "step": 4266000 }, { "epoch": 8.931113306762233, "grad_norm": 18.248201370239258, "learning_rate": 5.355654986658612e-06, "loss": 2.291, "step": 4266500 }, { "epoch": 8.932159962487859, "grad_norm": 17.779714584350586, "learning_rate": 5.35041073016832e-06, "loss": 2.2956, "step": 4267000 }, { "epoch": 8.933206618213484, "grad_norm": 16.79267120361328, "learning_rate": 5.345166473678028e-06, "loss": 2.2913, "step": 4267500 }, { "epoch": 8.93425327393911, "grad_norm": 20.967573165893555, "learning_rate": 5.339922217187736e-06, "loss": 2.2956, "step": 4268000 }, { "epoch": 8.935299929664735, "grad_norm": 21.251697540283203, "learning_rate": 5.334677960697444e-06, "loss": 2.3149, "step": 4268500 }, { "epoch": 8.93634658539036, "grad_norm": 18.292320251464844, "learning_rate": 5.3294337042071525e-06, "loss": 2.2928, "step": 4269000 }, { "epoch": 8.937393241115986, "grad_norm": 23.004106521606445, "learning_rate": 5.324189447716861e-06, "loss": 2.2976, "step": 4269500 }, { "epoch": 8.938439896841611, "grad_norm": 18.544830322265625, "learning_rate": 5.318945191226569e-06, "loss": 2.2846, "step": 4270000 }, { "epoch": 8.939486552567237, "grad_norm": 18.691329956054688, "learning_rate": 5.313700934736277e-06, "loss": 2.2978, "step": 4270500 }, { "epoch": 8.940533208292862, "grad_norm": 24.659095764160156, "learning_rate": 5.308456678245985e-06, "loss": 2.289, "step": 4271000 }, { "epoch": 8.941579864018488, "grad_norm": 18.362558364868164, "learning_rate": 5.3032124217556935e-06, "loss": 2.3181, "step": 4271500 }, { "epoch": 8.942626519744113, "grad_norm": 16.079833984375, "learning_rate": 5.297968165265402e-06, "loss": 2.2988, "step": 4272000 }, { "epoch": 8.943673175469739, "grad_norm": 18.0926456451416, "learning_rate": 5.29272390877511e-06, "loss": 2.3157, "step": 4272500 }, { "epoch": 8.944719831195364, "grad_norm": 21.127552032470703, "learning_rate": 5.2874796522848174e-06, "loss": 2.2919, "step": 4273000 }, { "epoch": 8.94576648692099, "grad_norm": 22.57141876220703, "learning_rate": 5.282235395794526e-06, "loss": 2.327, "step": 4273500 }, { "epoch": 8.946813142646615, "grad_norm": 18.023120880126953, "learning_rate": 5.276991139304234e-06, "loss": 2.2849, "step": 4274000 }, { "epoch": 8.94785979837224, "grad_norm": 19.4417724609375, "learning_rate": 5.271746882813942e-06, "loss": 2.3228, "step": 4274500 }, { "epoch": 8.948906454097866, "grad_norm": 19.600759506225586, "learning_rate": 5.266502626323651e-06, "loss": 2.2818, "step": 4275000 }, { "epoch": 8.949953109823491, "grad_norm": 21.35887908935547, "learning_rate": 5.2612583698333584e-06, "loss": 2.31, "step": 4275500 }, { "epoch": 8.950999765549117, "grad_norm": 19.90021514892578, "learning_rate": 5.256014113343067e-06, "loss": 2.3111, "step": 4276000 }, { "epoch": 8.952046421274742, "grad_norm": 25.085905075073242, "learning_rate": 5.2507698568527755e-06, "loss": 2.284, "step": 4276500 }, { "epoch": 8.953093077000368, "grad_norm": 16.992042541503906, "learning_rate": 5.245525600362483e-06, "loss": 2.3176, "step": 4277000 }, { "epoch": 8.954139732725993, "grad_norm": 20.22222137451172, "learning_rate": 5.240281343872192e-06, "loss": 2.3153, "step": 4277500 }, { "epoch": 8.955186388451619, "grad_norm": 19.6953125, "learning_rate": 5.2350370873819e-06, "loss": 2.3143, "step": 4278000 }, { "epoch": 8.956233044177244, "grad_norm": 19.372085571289062, "learning_rate": 5.229792830891608e-06, "loss": 2.284, "step": 4278500 }, { "epoch": 8.95727969990287, "grad_norm": 28.00524139404297, "learning_rate": 5.224548574401316e-06, "loss": 2.297, "step": 4279000 }, { "epoch": 8.958326355628495, "grad_norm": 20.305017471313477, "learning_rate": 5.219304317911023e-06, "loss": 2.292, "step": 4279500 }, { "epoch": 8.95937301135412, "grad_norm": 22.454757690429688, "learning_rate": 5.214060061420732e-06, "loss": 2.2836, "step": 4280000 }, { "epoch": 8.960419667079746, "grad_norm": 21.43400764465332, "learning_rate": 5.2088158049304404e-06, "loss": 2.2895, "step": 4280500 }, { "epoch": 8.961466322805371, "grad_norm": 22.81983184814453, "learning_rate": 5.203571548440148e-06, "loss": 2.3012, "step": 4281000 }, { "epoch": 8.962512978530997, "grad_norm": 16.05020523071289, "learning_rate": 5.198327291949857e-06, "loss": 2.2928, "step": 4281500 }, { "epoch": 8.963559634256622, "grad_norm": 15.743025779724121, "learning_rate": 5.193083035459565e-06, "loss": 2.271, "step": 4282000 }, { "epoch": 8.96460628998225, "grad_norm": 16.795549392700195, "learning_rate": 5.187838778969273e-06, "loss": 2.3213, "step": 4282500 }, { "epoch": 8.965652945707873, "grad_norm": 23.33873176574707, "learning_rate": 5.1825945224789814e-06, "loss": 2.3158, "step": 4283000 }, { "epoch": 8.9666996014335, "grad_norm": 19.5621280670166, "learning_rate": 5.17735026598869e-06, "loss": 2.3088, "step": 4283500 }, { "epoch": 8.967746257159126, "grad_norm": 19.635358810424805, "learning_rate": 5.172106009498398e-06, "loss": 2.2866, "step": 4284000 }, { "epoch": 8.968792912884751, "grad_norm": 20.01838493347168, "learning_rate": 5.166861753008106e-06, "loss": 2.3042, "step": 4284500 }, { "epoch": 8.969839568610377, "grad_norm": 21.830650329589844, "learning_rate": 5.161617496517814e-06, "loss": 2.3004, "step": 4285000 }, { "epoch": 8.970886224336002, "grad_norm": 19.20089340209961, "learning_rate": 5.156373240027522e-06, "loss": 2.308, "step": 4285500 }, { "epoch": 8.971932880061628, "grad_norm": 19.50883674621582, "learning_rate": 5.15112898353723e-06, "loss": 2.3047, "step": 4286000 }, { "epoch": 8.972979535787253, "grad_norm": 18.469518661499023, "learning_rate": 5.145884727046939e-06, "loss": 2.2922, "step": 4286500 }, { "epoch": 8.974026191512879, "grad_norm": 20.002578735351562, "learning_rate": 5.140640470556646e-06, "loss": 2.3074, "step": 4287000 }, { "epoch": 8.975072847238504, "grad_norm": 18.569232940673828, "learning_rate": 5.135396214066355e-06, "loss": 2.3144, "step": 4287500 }, { "epoch": 8.97611950296413, "grad_norm": 20.244888305664062, "learning_rate": 5.130151957576063e-06, "loss": 2.2919, "step": 4288000 }, { "epoch": 8.977166158689755, "grad_norm": 21.897735595703125, "learning_rate": 5.124907701085771e-06, "loss": 2.3078, "step": 4288500 }, { "epoch": 8.97821281441538, "grad_norm": 19.357309341430664, "learning_rate": 5.11966344459548e-06, "loss": 2.3046, "step": 4289000 }, { "epoch": 8.979259470141006, "grad_norm": 21.572383880615234, "learning_rate": 5.114419188105187e-06, "loss": 2.289, "step": 4289500 }, { "epoch": 8.980306125866631, "grad_norm": 19.590198516845703, "learning_rate": 5.109174931614896e-06, "loss": 2.3011, "step": 4290000 }, { "epoch": 8.981352781592257, "grad_norm": 18.875316619873047, "learning_rate": 5.103930675124604e-06, "loss": 2.2971, "step": 4290500 }, { "epoch": 8.982399437317882, "grad_norm": 24.303314208984375, "learning_rate": 5.098686418634311e-06, "loss": 2.2966, "step": 4291000 }, { "epoch": 8.983446093043508, "grad_norm": 20.60466194152832, "learning_rate": 5.09344216214402e-06, "loss": 2.2956, "step": 4291500 }, { "epoch": 8.984492748769133, "grad_norm": 19.361888885498047, "learning_rate": 5.088197905653728e-06, "loss": 2.2908, "step": 4292000 }, { "epoch": 8.985539404494759, "grad_norm": 22.197267532348633, "learning_rate": 5.082953649163436e-06, "loss": 2.2764, "step": 4292500 }, { "epoch": 8.986586060220384, "grad_norm": 20.2824649810791, "learning_rate": 5.077709392673145e-06, "loss": 2.2807, "step": 4293000 }, { "epoch": 8.98763271594601, "grad_norm": 19.101011276245117, "learning_rate": 5.072465136182853e-06, "loss": 2.297, "step": 4293500 }, { "epoch": 8.988679371671635, "grad_norm": 14.83177375793457, "learning_rate": 5.067220879692561e-06, "loss": 2.3054, "step": 4294000 }, { "epoch": 8.98972602739726, "grad_norm": 22.83993148803711, "learning_rate": 5.061976623202269e-06, "loss": 2.292, "step": 4294500 }, { "epoch": 8.990772683122886, "grad_norm": 19.57524299621582, "learning_rate": 5.056732366711978e-06, "loss": 2.293, "step": 4295000 }, { "epoch": 8.991819338848511, "grad_norm": 19.299161911010742, "learning_rate": 5.0514881102216856e-06, "loss": 2.2979, "step": 4295500 }, { "epoch": 8.992865994574137, "grad_norm": 23.872255325317383, "learning_rate": 5.046243853731394e-06, "loss": 2.3035, "step": 4296000 }, { "epoch": 8.993912650299762, "grad_norm": 17.215818405151367, "learning_rate": 5.040999597241102e-06, "loss": 2.2758, "step": 4296500 }, { "epoch": 8.994959306025388, "grad_norm": 17.812469482421875, "learning_rate": 5.0357553407508095e-06, "loss": 2.293, "step": 4297000 }, { "epoch": 8.996005961751013, "grad_norm": 21.998165130615234, "learning_rate": 5.030511084260518e-06, "loss": 2.3121, "step": 4297500 }, { "epoch": 8.997052617476639, "grad_norm": 16.391664505004883, "learning_rate": 5.025266827770226e-06, "loss": 2.3018, "step": 4298000 }, { "epoch": 8.998099273202264, "grad_norm": 19.343645095825195, "learning_rate": 5.020022571279934e-06, "loss": 2.3054, "step": 4298500 }, { "epoch": 8.99914592892789, "grad_norm": 19.29079818725586, "learning_rate": 5.014778314789643e-06, "loss": 2.3091, "step": 4299000 }, { "epoch": 9.000192584653515, "grad_norm": 20.344707489013672, "learning_rate": 5.0095340582993505e-06, "loss": 2.2893, "step": 4299500 }, { "epoch": 9.00123924037914, "grad_norm": 19.83376121520996, "learning_rate": 5.004289801809059e-06, "loss": 2.3014, "step": 4300000 }, { "epoch": 9.002285896104766, "grad_norm": 20.96561050415039, "learning_rate": 4.9990455453187676e-06, "loss": 2.2923, "step": 4300500 }, { "epoch": 9.003332551830391, "grad_norm": 19.091760635375977, "learning_rate": 4.993801288828475e-06, "loss": 2.2847, "step": 4301000 }, { "epoch": 9.004379207556017, "grad_norm": 21.40865135192871, "learning_rate": 4.988557032338184e-06, "loss": 2.2822, "step": 4301500 }, { "epoch": 9.005425863281642, "grad_norm": 19.937698364257812, "learning_rate": 4.9833127758478915e-06, "loss": 2.28, "step": 4302000 }, { "epoch": 9.006472519007268, "grad_norm": 18.084285736083984, "learning_rate": 4.978068519357599e-06, "loss": 2.2782, "step": 4302500 }, { "epoch": 9.007519174732893, "grad_norm": 27.04085350036621, "learning_rate": 4.972824262867308e-06, "loss": 2.2964, "step": 4303000 }, { "epoch": 9.008565830458519, "grad_norm": 19.113935470581055, "learning_rate": 4.967580006377016e-06, "loss": 2.2892, "step": 4303500 }, { "epoch": 9.009612486184144, "grad_norm": 22.8785400390625, "learning_rate": 4.962335749886724e-06, "loss": 2.2982, "step": 4304000 }, { "epoch": 9.01065914190977, "grad_norm": 17.910934448242188, "learning_rate": 4.9570914933964325e-06, "loss": 2.3199, "step": 4304500 }, { "epoch": 9.011705797635395, "grad_norm": 18.798582077026367, "learning_rate": 4.951847236906141e-06, "loss": 2.2885, "step": 4305000 }, { "epoch": 9.01275245336102, "grad_norm": 19.274295806884766, "learning_rate": 4.946602980415849e-06, "loss": 2.2972, "step": 4305500 }, { "epoch": 9.013799109086646, "grad_norm": 22.498525619506836, "learning_rate": 4.941358723925557e-06, "loss": 2.2815, "step": 4306000 }, { "epoch": 9.014845764812272, "grad_norm": 20.501401901245117, "learning_rate": 4.936114467435265e-06, "loss": 2.3007, "step": 4306500 }, { "epoch": 9.015892420537897, "grad_norm": 18.64659309387207, "learning_rate": 4.9308702109449735e-06, "loss": 2.2856, "step": 4307000 }, { "epoch": 9.016939076263522, "grad_norm": 23.750303268432617, "learning_rate": 4.925625954454682e-06, "loss": 2.2851, "step": 4307500 }, { "epoch": 9.017985731989148, "grad_norm": 18.76752281188965, "learning_rate": 4.92038169796439e-06, "loss": 2.2866, "step": 4308000 }, { "epoch": 9.019032387714773, "grad_norm": 18.879972457885742, "learning_rate": 4.915137441474097e-06, "loss": 2.2759, "step": 4308500 }, { "epoch": 9.020079043440399, "grad_norm": 17.415912628173828, "learning_rate": 4.909893184983806e-06, "loss": 2.2954, "step": 4309000 }, { "epoch": 9.021125699166024, "grad_norm": 19.326236724853516, "learning_rate": 4.904648928493514e-06, "loss": 2.2888, "step": 4309500 }, { "epoch": 9.02217235489165, "grad_norm": 23.085617065429688, "learning_rate": 4.899404672003222e-06, "loss": 2.3122, "step": 4310000 }, { "epoch": 9.023219010617275, "grad_norm": 15.627378463745117, "learning_rate": 4.894160415512931e-06, "loss": 2.2885, "step": 4310500 }, { "epoch": 9.0242656663429, "grad_norm": 17.049205780029297, "learning_rate": 4.888916159022638e-06, "loss": 2.274, "step": 4311000 }, { "epoch": 9.025312322068526, "grad_norm": 21.7581729888916, "learning_rate": 4.883671902532347e-06, "loss": 2.2819, "step": 4311500 }, { "epoch": 9.026358977794152, "grad_norm": 19.245738983154297, "learning_rate": 4.8784276460420555e-06, "loss": 2.2856, "step": 4312000 }, { "epoch": 9.027405633519777, "grad_norm": 21.05806541442871, "learning_rate": 4.873183389551763e-06, "loss": 2.2966, "step": 4312500 }, { "epoch": 9.028452289245402, "grad_norm": 18.948514938354492, "learning_rate": 4.867939133061472e-06, "loss": 2.309, "step": 4313000 }, { "epoch": 9.029498944971028, "grad_norm": 21.66029167175293, "learning_rate": 4.862694876571179e-06, "loss": 2.2899, "step": 4313500 }, { "epoch": 9.030545600696653, "grad_norm": 19.949201583862305, "learning_rate": 4.857450620080888e-06, "loss": 2.2918, "step": 4314000 }, { "epoch": 9.031592256422279, "grad_norm": 17.264755249023438, "learning_rate": 4.852206363590596e-06, "loss": 2.2836, "step": 4314500 }, { "epoch": 9.032638912147904, "grad_norm": 18.60687255859375, "learning_rate": 4.846962107100303e-06, "loss": 2.2872, "step": 4315000 }, { "epoch": 9.03368556787353, "grad_norm": 19.70314598083496, "learning_rate": 4.841717850610012e-06, "loss": 2.2879, "step": 4315500 }, { "epoch": 9.034732223599155, "grad_norm": 18.87236785888672, "learning_rate": 4.83647359411972e-06, "loss": 2.2781, "step": 4316000 }, { "epoch": 9.03577887932478, "grad_norm": 21.025754928588867, "learning_rate": 4.831229337629428e-06, "loss": 2.3009, "step": 4316500 }, { "epoch": 9.036825535050406, "grad_norm": 21.03815269470215, "learning_rate": 4.825985081139137e-06, "loss": 2.3135, "step": 4317000 }, { "epoch": 9.037872190776032, "grad_norm": 20.71939468383789, "learning_rate": 4.820740824648845e-06, "loss": 2.2843, "step": 4317500 }, { "epoch": 9.038918846501659, "grad_norm": 19.819143295288086, "learning_rate": 4.815496568158553e-06, "loss": 2.2873, "step": 4318000 }, { "epoch": 9.039965502227284, "grad_norm": 18.161880493164062, "learning_rate": 4.810252311668261e-06, "loss": 2.2994, "step": 4318500 }, { "epoch": 9.04101215795291, "grad_norm": 21.15572738647461, "learning_rate": 4.80500805517797e-06, "loss": 2.2883, "step": 4319000 }, { "epoch": 9.042058813678535, "grad_norm": 19.200212478637695, "learning_rate": 4.799763798687678e-06, "loss": 2.2922, "step": 4319500 }, { "epoch": 9.04310546940416, "grad_norm": 21.146820068359375, "learning_rate": 4.794519542197385e-06, "loss": 2.2881, "step": 4320000 }, { "epoch": 9.044152125129786, "grad_norm": 22.056081771850586, "learning_rate": 4.789275285707094e-06, "loss": 2.2897, "step": 4320500 }, { "epoch": 9.045198780855412, "grad_norm": 21.167526245117188, "learning_rate": 4.7840310292168015e-06, "loss": 2.3107, "step": 4321000 }, { "epoch": 9.046245436581037, "grad_norm": 19.745420455932617, "learning_rate": 4.77878677272651e-06, "loss": 2.2968, "step": 4321500 }, { "epoch": 9.047292092306662, "grad_norm": 19.240036010742188, "learning_rate": 4.773542516236219e-06, "loss": 2.2902, "step": 4322000 }, { "epoch": 9.048338748032288, "grad_norm": 19.713485717773438, "learning_rate": 4.768298259745926e-06, "loss": 2.2954, "step": 4322500 }, { "epoch": 9.049385403757913, "grad_norm": 20.760332107543945, "learning_rate": 4.763054003255635e-06, "loss": 2.2811, "step": 4323000 }, { "epoch": 9.050432059483539, "grad_norm": 39.34925079345703, "learning_rate": 4.7578097467653425e-06, "loss": 2.2923, "step": 4323500 }, { "epoch": 9.051478715209164, "grad_norm": 24.98177146911621, "learning_rate": 4.752565490275051e-06, "loss": 2.276, "step": 4324000 }, { "epoch": 9.05252537093479, "grad_norm": 17.80513572692871, "learning_rate": 4.74732123378476e-06, "loss": 2.2766, "step": 4324500 }, { "epoch": 9.053572026660415, "grad_norm": 18.10126495361328, "learning_rate": 4.742076977294467e-06, "loss": 2.3029, "step": 4325000 }, { "epoch": 9.05461868238604, "grad_norm": 20.719541549682617, "learning_rate": 4.736832720804176e-06, "loss": 2.3042, "step": 4325500 }, { "epoch": 9.055665338111666, "grad_norm": 16.001399993896484, "learning_rate": 4.7315884643138835e-06, "loss": 2.2988, "step": 4326000 }, { "epoch": 9.056711993837292, "grad_norm": 18.43076515197754, "learning_rate": 4.726344207823591e-06, "loss": 2.3042, "step": 4326500 }, { "epoch": 9.057758649562917, "grad_norm": 22.33717155456543, "learning_rate": 4.7210999513333e-06, "loss": 2.2934, "step": 4327000 }, { "epoch": 9.058805305288542, "grad_norm": 23.228967666625977, "learning_rate": 4.715855694843008e-06, "loss": 2.3033, "step": 4327500 }, { "epoch": 9.059851961014168, "grad_norm": 21.827714920043945, "learning_rate": 4.710611438352716e-06, "loss": 2.3007, "step": 4328000 }, { "epoch": 9.060898616739793, "grad_norm": 15.316986083984375, "learning_rate": 4.7053671818624245e-06, "loss": 2.3075, "step": 4328500 }, { "epoch": 9.061945272465419, "grad_norm": 24.72755241394043, "learning_rate": 4.700122925372133e-06, "loss": 2.2889, "step": 4329000 }, { "epoch": 9.062991928191044, "grad_norm": 19.870220184326172, "learning_rate": 4.694878668881841e-06, "loss": 2.2991, "step": 4329500 }, { "epoch": 9.06403858391667, "grad_norm": 16.323768615722656, "learning_rate": 4.689634412391549e-06, "loss": 2.291, "step": 4330000 }, { "epoch": 9.065085239642295, "grad_norm": 20.59893226623535, "learning_rate": 4.684390155901258e-06, "loss": 2.2902, "step": 4330500 }, { "epoch": 9.06613189536792, "grad_norm": 16.252361297607422, "learning_rate": 4.6791458994109655e-06, "loss": 2.2969, "step": 4331000 }, { "epoch": 9.067178551093546, "grad_norm": 17.501554489135742, "learning_rate": 4.673901642920673e-06, "loss": 2.3087, "step": 4331500 }, { "epoch": 9.068225206819172, "grad_norm": 21.232688903808594, "learning_rate": 4.668657386430381e-06, "loss": 2.2865, "step": 4332000 }, { "epoch": 9.069271862544797, "grad_norm": 18.289535522460938, "learning_rate": 4.6634131299400895e-06, "loss": 2.2954, "step": 4332500 }, { "epoch": 9.070318518270422, "grad_norm": 20.480634689331055, "learning_rate": 4.658168873449798e-06, "loss": 2.2873, "step": 4333000 }, { "epoch": 9.071365173996048, "grad_norm": 21.134143829345703, "learning_rate": 4.652924616959506e-06, "loss": 2.2826, "step": 4333500 }, { "epoch": 9.072411829721673, "grad_norm": 18.65264320373535, "learning_rate": 4.647680360469214e-06, "loss": 2.3008, "step": 4334000 }, { "epoch": 9.073458485447299, "grad_norm": 20.466651916503906, "learning_rate": 4.642436103978923e-06, "loss": 2.2911, "step": 4334500 }, { "epoch": 9.074505141172924, "grad_norm": 19.236343383789062, "learning_rate": 4.6371918474886304e-06, "loss": 2.2768, "step": 4335000 }, { "epoch": 9.07555179689855, "grad_norm": 19.250194549560547, "learning_rate": 4.631947590998339e-06, "loss": 2.2966, "step": 4335500 }, { "epoch": 9.076598452624175, "grad_norm": 20.573598861694336, "learning_rate": 4.6267033345080475e-06, "loss": 2.3203, "step": 4336000 }, { "epoch": 9.0776451083498, "grad_norm": 19.900768280029297, "learning_rate": 4.621459078017755e-06, "loss": 2.3009, "step": 4336500 }, { "epoch": 9.078691764075426, "grad_norm": 19.14173126220703, "learning_rate": 4.616214821527464e-06, "loss": 2.309, "step": 4337000 }, { "epoch": 9.079738419801052, "grad_norm": 20.106908798217773, "learning_rate": 4.6109705650371714e-06, "loss": 2.2766, "step": 4337500 }, { "epoch": 9.080785075526677, "grad_norm": 18.09816551208496, "learning_rate": 4.605726308546879e-06, "loss": 2.302, "step": 4338000 }, { "epoch": 9.081831731252302, "grad_norm": 20.107515335083008, "learning_rate": 4.600482052056588e-06, "loss": 2.2914, "step": 4338500 }, { "epoch": 9.082878386977928, "grad_norm": 17.578861236572266, "learning_rate": 4.595237795566296e-06, "loss": 2.3182, "step": 4339000 }, { "epoch": 9.083925042703553, "grad_norm": 22.669721603393555, "learning_rate": 4.589993539076004e-06, "loss": 2.2945, "step": 4339500 }, { "epoch": 9.084971698429179, "grad_norm": 22.788314819335938, "learning_rate": 4.5847492825857124e-06, "loss": 2.3001, "step": 4340000 }, { "epoch": 9.086018354154804, "grad_norm": 20.60722541809082, "learning_rate": 4.57950502609542e-06, "loss": 2.3025, "step": 4340500 }, { "epoch": 9.08706500988043, "grad_norm": 20.01075553894043, "learning_rate": 4.574260769605129e-06, "loss": 2.2902, "step": 4341000 }, { "epoch": 9.088111665606055, "grad_norm": 17.518911361694336, "learning_rate": 4.569016513114837e-06, "loss": 2.2936, "step": 4341500 }, { "epoch": 9.08915832133168, "grad_norm": 19.45370864868164, "learning_rate": 4.563772256624545e-06, "loss": 2.2993, "step": 4342000 }, { "epoch": 9.090204977057306, "grad_norm": 18.715871810913086, "learning_rate": 4.5585280001342534e-06, "loss": 2.2828, "step": 4342500 }, { "epoch": 9.091251632782932, "grad_norm": 23.89344596862793, "learning_rate": 4.553283743643961e-06, "loss": 2.2856, "step": 4343000 }, { "epoch": 9.092298288508557, "grad_norm": 22.741424560546875, "learning_rate": 4.54803948715367e-06, "loss": 2.313, "step": 4343500 }, { "epoch": 9.093344944234182, "grad_norm": 20.53956413269043, "learning_rate": 4.542795230663377e-06, "loss": 2.2813, "step": 4344000 }, { "epoch": 9.094391599959808, "grad_norm": 20.02401351928711, "learning_rate": 4.537550974173086e-06, "loss": 2.3016, "step": 4344500 }, { "epoch": 9.095438255685433, "grad_norm": 23.101900100708008, "learning_rate": 4.532306717682794e-06, "loss": 2.2894, "step": 4345000 }, { "epoch": 9.096484911411059, "grad_norm": 20.906280517578125, "learning_rate": 4.527062461192502e-06, "loss": 2.2925, "step": 4345500 }, { "epoch": 9.097531567136684, "grad_norm": 20.439104080200195, "learning_rate": 4.521818204702211e-06, "loss": 2.2922, "step": 4346000 }, { "epoch": 9.09857822286231, "grad_norm": 20.955739974975586, "learning_rate": 4.516573948211918e-06, "loss": 2.2889, "step": 4346500 }, { "epoch": 9.099624878587935, "grad_norm": 31.504236221313477, "learning_rate": 4.511329691721627e-06, "loss": 2.3005, "step": 4347000 }, { "epoch": 9.10067153431356, "grad_norm": 18.495431900024414, "learning_rate": 4.5060854352313354e-06, "loss": 2.3025, "step": 4347500 }, { "epoch": 9.101718190039186, "grad_norm": 19.37330436706543, "learning_rate": 4.500841178741043e-06, "loss": 2.3003, "step": 4348000 }, { "epoch": 9.102764845764812, "grad_norm": 25.741924285888672, "learning_rate": 4.495596922250752e-06, "loss": 2.2819, "step": 4348500 }, { "epoch": 9.103811501490437, "grad_norm": 18.525333404541016, "learning_rate": 4.490352665760459e-06, "loss": 2.2874, "step": 4349000 }, { "epoch": 9.104858157216063, "grad_norm": 20.084447860717773, "learning_rate": 4.485108409270167e-06, "loss": 2.2786, "step": 4349500 }, { "epoch": 9.105904812941688, "grad_norm": 17.526193618774414, "learning_rate": 4.479864152779876e-06, "loss": 2.3022, "step": 4350000 }, { "epoch": 9.106951468667313, "grad_norm": 17.74315643310547, "learning_rate": 4.474619896289583e-06, "loss": 2.2946, "step": 4350500 }, { "epoch": 9.107998124392939, "grad_norm": 18.6864070892334, "learning_rate": 4.469375639799292e-06, "loss": 2.2985, "step": 4351000 }, { "epoch": 9.109044780118564, "grad_norm": 19.84815216064453, "learning_rate": 4.464131383309e-06, "loss": 2.2841, "step": 4351500 }, { "epoch": 9.11009143584419, "grad_norm": 19.465024948120117, "learning_rate": 4.458887126818708e-06, "loss": 2.2656, "step": 4352000 }, { "epoch": 9.111138091569815, "grad_norm": 18.682144165039062, "learning_rate": 4.453642870328417e-06, "loss": 2.3036, "step": 4352500 }, { "epoch": 9.112184747295442, "grad_norm": 17.32817840576172, "learning_rate": 4.448398613838125e-06, "loss": 2.2964, "step": 4353000 }, { "epoch": 9.113231403021068, "grad_norm": 20.57769775390625, "learning_rate": 4.443154357347833e-06, "loss": 2.2947, "step": 4353500 }, { "epoch": 9.114278058746693, "grad_norm": 18.82100486755371, "learning_rate": 4.437910100857541e-06, "loss": 2.2866, "step": 4354000 }, { "epoch": 9.115324714472319, "grad_norm": 19.376663208007812, "learning_rate": 4.43266584436725e-06, "loss": 2.2886, "step": 4354500 }, { "epoch": 9.116371370197944, "grad_norm": 17.63304328918457, "learning_rate": 4.427421587876958e-06, "loss": 2.2916, "step": 4355000 }, { "epoch": 9.11741802592357, "grad_norm": 21.9445743560791, "learning_rate": 4.422177331386665e-06, "loss": 2.2777, "step": 4355500 }, { "epoch": 9.118464681649195, "grad_norm": 20.24652099609375, "learning_rate": 4.416933074896374e-06, "loss": 2.2739, "step": 4356000 }, { "epoch": 9.11951133737482, "grad_norm": 19.752864837646484, "learning_rate": 4.4116888184060815e-06, "loss": 2.3008, "step": 4356500 }, { "epoch": 9.120557993100446, "grad_norm": 18.557600021362305, "learning_rate": 4.40644456191579e-06, "loss": 2.2866, "step": 4357000 }, { "epoch": 9.121604648826072, "grad_norm": 21.034181594848633, "learning_rate": 4.4012003054254986e-06, "loss": 2.2892, "step": 4357500 }, { "epoch": 9.122651304551697, "grad_norm": 20.18700408935547, "learning_rate": 4.395956048935206e-06, "loss": 2.2968, "step": 4358000 }, { "epoch": 9.123697960277322, "grad_norm": 24.115495681762695, "learning_rate": 4.390711792444915e-06, "loss": 2.2717, "step": 4358500 }, { "epoch": 9.124744616002948, "grad_norm": 20.898883819580078, "learning_rate": 4.3854675359546225e-06, "loss": 2.2963, "step": 4359000 }, { "epoch": 9.125791271728573, "grad_norm": 18.52240562438965, "learning_rate": 4.380223279464331e-06, "loss": 2.291, "step": 4359500 }, { "epoch": 9.126837927454199, "grad_norm": 22.791391372680664, "learning_rate": 4.3749790229740396e-06, "loss": 2.2812, "step": 4360000 }, { "epoch": 9.127884583179824, "grad_norm": 21.320775985717773, "learning_rate": 4.369734766483747e-06, "loss": 2.2843, "step": 4360500 }, { "epoch": 9.12893123890545, "grad_norm": 19.17307472229004, "learning_rate": 4.364490509993455e-06, "loss": 2.27, "step": 4361000 }, { "epoch": 9.129977894631075, "grad_norm": 17.313762664794922, "learning_rate": 4.3592462535031635e-06, "loss": 2.3197, "step": 4361500 }, { "epoch": 9.1310245503567, "grad_norm": 19.11039924621582, "learning_rate": 4.354001997012871e-06, "loss": 2.2994, "step": 4362000 }, { "epoch": 9.132071206082326, "grad_norm": 24.205135345458984, "learning_rate": 4.34875774052258e-06, "loss": 2.2912, "step": 4362500 }, { "epoch": 9.133117861807952, "grad_norm": 19.963468551635742, "learning_rate": 4.343513484032288e-06, "loss": 2.2849, "step": 4363000 }, { "epoch": 9.134164517533577, "grad_norm": 20.746475219726562, "learning_rate": 4.338269227541996e-06, "loss": 2.2818, "step": 4363500 }, { "epoch": 9.135211173259203, "grad_norm": 19.90723991394043, "learning_rate": 4.3330249710517045e-06, "loss": 2.295, "step": 4364000 }, { "epoch": 9.136257828984828, "grad_norm": 24.627859115600586, "learning_rate": 4.327780714561413e-06, "loss": 2.2883, "step": 4364500 }, { "epoch": 9.137304484710453, "grad_norm": 17.122249603271484, "learning_rate": 4.322536458071121e-06, "loss": 2.2744, "step": 4365000 }, { "epoch": 9.138351140436079, "grad_norm": 14.22311782836914, "learning_rate": 4.317292201580829e-06, "loss": 2.2693, "step": 4365500 }, { "epoch": 9.139397796161704, "grad_norm": 24.063940048217773, "learning_rate": 4.312047945090538e-06, "loss": 2.2853, "step": 4366000 }, { "epoch": 9.14044445188733, "grad_norm": 16.572423934936523, "learning_rate": 4.3068036886002455e-06, "loss": 2.2811, "step": 4366500 }, { "epoch": 9.141491107612955, "grad_norm": 20.322772979736328, "learning_rate": 4.301559432109953e-06, "loss": 2.2879, "step": 4367000 }, { "epoch": 9.14253776333858, "grad_norm": 18.18848419189453, "learning_rate": 4.296315175619661e-06, "loss": 2.2925, "step": 4367500 }, { "epoch": 9.143584419064206, "grad_norm": 15.337740898132324, "learning_rate": 4.291070919129369e-06, "loss": 2.2932, "step": 4368000 }, { "epoch": 9.144631074789832, "grad_norm": 25.24283790588379, "learning_rate": 4.285826662639078e-06, "loss": 2.3137, "step": 4368500 }, { "epoch": 9.145677730515457, "grad_norm": 20.945816040039062, "learning_rate": 4.280582406148786e-06, "loss": 2.294, "step": 4369000 }, { "epoch": 9.146724386241083, "grad_norm": 18.208463668823242, "learning_rate": 4.275338149658494e-06, "loss": 2.2603, "step": 4369500 }, { "epoch": 9.147771041966708, "grad_norm": 23.2113094329834, "learning_rate": 4.270093893168203e-06, "loss": 2.2779, "step": 4370000 }, { "epoch": 9.148817697692333, "grad_norm": 21.202117919921875, "learning_rate": 4.26484963667791e-06, "loss": 2.2881, "step": 4370500 }, { "epoch": 9.149864353417959, "grad_norm": 18.373741149902344, "learning_rate": 4.259605380187619e-06, "loss": 2.2976, "step": 4371000 }, { "epoch": 9.150911009143584, "grad_norm": 17.492122650146484, "learning_rate": 4.2543611236973275e-06, "loss": 2.3094, "step": 4371500 }, { "epoch": 9.15195766486921, "grad_norm": 23.96283721923828, "learning_rate": 4.249116867207035e-06, "loss": 2.2856, "step": 4372000 }, { "epoch": 9.153004320594835, "grad_norm": 19.379653930664062, "learning_rate": 4.243872610716743e-06, "loss": 2.2985, "step": 4372500 }, { "epoch": 9.15405097632046, "grad_norm": 19.56728744506836, "learning_rate": 4.238628354226451e-06, "loss": 2.271, "step": 4373000 }, { "epoch": 9.155097632046086, "grad_norm": 19.240421295166016, "learning_rate": 4.233384097736159e-06, "loss": 2.2869, "step": 4373500 }, { "epoch": 9.156144287771712, "grad_norm": 21.687437057495117, "learning_rate": 4.228139841245868e-06, "loss": 2.3008, "step": 4374000 }, { "epoch": 9.157190943497337, "grad_norm": 21.77785301208496, "learning_rate": 4.222895584755576e-06, "loss": 2.2827, "step": 4374500 }, { "epoch": 9.158237599222963, "grad_norm": 21.231246948242188, "learning_rate": 4.217651328265284e-06, "loss": 2.2998, "step": 4375000 }, { "epoch": 9.159284254948588, "grad_norm": 19.58755874633789, "learning_rate": 4.212407071774992e-06, "loss": 2.2796, "step": 4375500 }, { "epoch": 9.160330910674213, "grad_norm": 20.424251556396484, "learning_rate": 4.2071628152847e-06, "loss": 2.2876, "step": 4376000 }, { "epoch": 9.161377566399839, "grad_norm": 21.707096099853516, "learning_rate": 4.201918558794409e-06, "loss": 2.2939, "step": 4376500 }, { "epoch": 9.162424222125464, "grad_norm": 21.109081268310547, "learning_rate": 4.196674302304117e-06, "loss": 2.2824, "step": 4377000 }, { "epoch": 9.16347087785109, "grad_norm": 19.161052703857422, "learning_rate": 4.191430045813825e-06, "loss": 2.2808, "step": 4377500 }, { "epoch": 9.164517533576715, "grad_norm": 17.61859703063965, "learning_rate": 4.186185789323533e-06, "loss": 2.2823, "step": 4378000 }, { "epoch": 9.16556418930234, "grad_norm": 17.97209930419922, "learning_rate": 4.180941532833241e-06, "loss": 2.292, "step": 4378500 }, { "epoch": 9.166610845027966, "grad_norm": 25.357887268066406, "learning_rate": 4.175697276342949e-06, "loss": 2.2909, "step": 4379000 }, { "epoch": 9.167657500753592, "grad_norm": 18.690767288208008, "learning_rate": 4.170453019852657e-06, "loss": 2.2897, "step": 4379500 }, { "epoch": 9.168704156479217, "grad_norm": 30.81415367126465, "learning_rate": 4.165208763362366e-06, "loss": 2.2915, "step": 4380000 }, { "epoch": 9.169750812204843, "grad_norm": 20.954452514648438, "learning_rate": 4.1599645068720735e-06, "loss": 2.2746, "step": 4380500 }, { "epoch": 9.170797467930468, "grad_norm": 16.843528747558594, "learning_rate": 4.154720250381782e-06, "loss": 2.3, "step": 4381000 }, { "epoch": 9.171844123656093, "grad_norm": 18.269269943237305, "learning_rate": 4.149475993891491e-06, "loss": 2.2905, "step": 4381500 }, { "epoch": 9.172890779381719, "grad_norm": 17.036094665527344, "learning_rate": 4.144231737401198e-06, "loss": 2.2849, "step": 4382000 }, { "epoch": 9.173937435107344, "grad_norm": 28.574398040771484, "learning_rate": 4.138987480910907e-06, "loss": 2.2794, "step": 4382500 }, { "epoch": 9.17498409083297, "grad_norm": 17.815813064575195, "learning_rate": 4.133743224420615e-06, "loss": 2.2882, "step": 4383000 }, { "epoch": 9.176030746558595, "grad_norm": 22.062305450439453, "learning_rate": 4.128498967930323e-06, "loss": 2.2812, "step": 4383500 }, { "epoch": 9.17707740228422, "grad_norm": 17.392078399658203, "learning_rate": 4.123254711440032e-06, "loss": 2.2875, "step": 4384000 }, { "epoch": 9.178124058009846, "grad_norm": 22.46931266784668, "learning_rate": 4.118010454949739e-06, "loss": 2.2664, "step": 4384500 }, { "epoch": 9.179170713735472, "grad_norm": 19.267295837402344, "learning_rate": 4.112766198459447e-06, "loss": 2.2821, "step": 4385000 }, { "epoch": 9.180217369461097, "grad_norm": 17.904157638549805, "learning_rate": 4.1075219419691555e-06, "loss": 2.2933, "step": 4385500 }, { "epoch": 9.181264025186723, "grad_norm": 21.78026008605957, "learning_rate": 4.102277685478863e-06, "loss": 2.3034, "step": 4386000 }, { "epoch": 9.182310680912348, "grad_norm": 18.890756607055664, "learning_rate": 4.097033428988572e-06, "loss": 2.2871, "step": 4386500 }, { "epoch": 9.183357336637973, "grad_norm": 16.331998825073242, "learning_rate": 4.09178917249828e-06, "loss": 2.2861, "step": 4387000 }, { "epoch": 9.184403992363599, "grad_norm": 16.45268440246582, "learning_rate": 4.086544916007988e-06, "loss": 2.2972, "step": 4387500 }, { "epoch": 9.185450648089226, "grad_norm": 16.979955673217773, "learning_rate": 4.0813006595176965e-06, "loss": 2.2879, "step": 4388000 }, { "epoch": 9.186497303814852, "grad_norm": 21.140522003173828, "learning_rate": 4.076056403027405e-06, "loss": 2.2892, "step": 4388500 }, { "epoch": 9.187543959540477, "grad_norm": 22.49710464477539, "learning_rate": 4.070812146537113e-06, "loss": 2.2831, "step": 4389000 }, { "epoch": 9.188590615266103, "grad_norm": 17.84767723083496, "learning_rate": 4.065567890046821e-06, "loss": 2.2976, "step": 4389500 }, { "epoch": 9.189637270991728, "grad_norm": 17.61197280883789, "learning_rate": 4.060323633556529e-06, "loss": 2.2875, "step": 4390000 }, { "epoch": 9.190683926717353, "grad_norm": 22.400436401367188, "learning_rate": 4.055079377066237e-06, "loss": 2.284, "step": 4390500 }, { "epoch": 9.191730582442979, "grad_norm": 17.42020606994629, "learning_rate": 4.049835120575945e-06, "loss": 2.2916, "step": 4391000 }, { "epoch": 9.192777238168604, "grad_norm": 22.267410278320312, "learning_rate": 4.044590864085654e-06, "loss": 2.2785, "step": 4391500 }, { "epoch": 9.19382389389423, "grad_norm": 20.972326278686523, "learning_rate": 4.0393466075953615e-06, "loss": 2.2888, "step": 4392000 }, { "epoch": 9.194870549619855, "grad_norm": 21.530155181884766, "learning_rate": 4.03410235110507e-06, "loss": 2.2996, "step": 4392500 }, { "epoch": 9.19591720534548, "grad_norm": 19.402204513549805, "learning_rate": 4.028858094614778e-06, "loss": 2.2963, "step": 4393000 }, { "epoch": 9.196963861071106, "grad_norm": 19.253549575805664, "learning_rate": 4.023613838124486e-06, "loss": 2.3008, "step": 4393500 }, { "epoch": 9.198010516796732, "grad_norm": 18.381507873535156, "learning_rate": 4.018369581634195e-06, "loss": 2.2723, "step": 4394000 }, { "epoch": 9.199057172522357, "grad_norm": 21.18077278137207, "learning_rate": 4.0131253251439025e-06, "loss": 2.2854, "step": 4394500 }, { "epoch": 9.200103828247983, "grad_norm": 19.887704849243164, "learning_rate": 4.007881068653611e-06, "loss": 2.2909, "step": 4395000 }, { "epoch": 9.201150483973608, "grad_norm": 22.059247970581055, "learning_rate": 4.0026368121633195e-06, "loss": 2.3107, "step": 4395500 }, { "epoch": 9.202197139699233, "grad_norm": 19.152057647705078, "learning_rate": 3.997392555673027e-06, "loss": 2.2658, "step": 4396000 }, { "epoch": 9.203243795424859, "grad_norm": 23.593576431274414, "learning_rate": 3.992148299182735e-06, "loss": 2.2863, "step": 4396500 }, { "epoch": 9.204290451150484, "grad_norm": 22.197248458862305, "learning_rate": 3.9869040426924434e-06, "loss": 2.2858, "step": 4397000 }, { "epoch": 9.20533710687611, "grad_norm": 19.38201332092285, "learning_rate": 3.981659786202151e-06, "loss": 2.2744, "step": 4397500 }, { "epoch": 9.206383762601735, "grad_norm": 19.979171752929688, "learning_rate": 3.97641552971186e-06, "loss": 2.2761, "step": 4398000 }, { "epoch": 9.20743041832736, "grad_norm": 18.837556838989258, "learning_rate": 3.971171273221568e-06, "loss": 2.2823, "step": 4398500 }, { "epoch": 9.208477074052986, "grad_norm": 24.058704376220703, "learning_rate": 3.965927016731276e-06, "loss": 2.2787, "step": 4399000 }, { "epoch": 9.209523729778612, "grad_norm": 19.61838150024414, "learning_rate": 3.9606827602409844e-06, "loss": 2.3128, "step": 4399500 }, { "epoch": 9.210570385504237, "grad_norm": 18.291471481323242, "learning_rate": 3.955438503750693e-06, "loss": 2.2832, "step": 4400000 }, { "epoch": 9.211617041229863, "grad_norm": 19.373430252075195, "learning_rate": 3.950194247260401e-06, "loss": 2.272, "step": 4400500 }, { "epoch": 9.212663696955488, "grad_norm": 18.677515029907227, "learning_rate": 3.944949990770109e-06, "loss": 2.2995, "step": 4401000 }, { "epoch": 9.213710352681113, "grad_norm": 22.23997688293457, "learning_rate": 3.939705734279817e-06, "loss": 2.2716, "step": 4401500 }, { "epoch": 9.214757008406739, "grad_norm": 19.234085083007812, "learning_rate": 3.934461477789525e-06, "loss": 2.29, "step": 4402000 }, { "epoch": 9.215803664132364, "grad_norm": 18.238990783691406, "learning_rate": 3.929217221299233e-06, "loss": 2.3106, "step": 4402500 }, { "epoch": 9.21685031985799, "grad_norm": 18.27689552307129, "learning_rate": 3.923972964808941e-06, "loss": 2.2907, "step": 4403000 }, { "epoch": 9.217896975583615, "grad_norm": 20.835649490356445, "learning_rate": 3.918728708318649e-06, "loss": 2.2857, "step": 4403500 }, { "epoch": 9.21894363130924, "grad_norm": 17.752702713012695, "learning_rate": 3.913484451828358e-06, "loss": 2.2942, "step": 4404000 }, { "epoch": 9.219990287034866, "grad_norm": 19.86060333251953, "learning_rate": 3.908240195338066e-06, "loss": 2.2771, "step": 4404500 }, { "epoch": 9.221036942760492, "grad_norm": 21.352933883666992, "learning_rate": 3.902995938847774e-06, "loss": 2.2933, "step": 4405000 }, { "epoch": 9.222083598486117, "grad_norm": 18.420917510986328, "learning_rate": 3.897751682357483e-06, "loss": 2.2771, "step": 4405500 }, { "epoch": 9.223130254211743, "grad_norm": 17.076642990112305, "learning_rate": 3.89250742586719e-06, "loss": 2.2767, "step": 4406000 }, { "epoch": 9.224176909937368, "grad_norm": 19.22255516052246, "learning_rate": 3.887263169376899e-06, "loss": 2.2722, "step": 4406500 }, { "epoch": 9.225223565662994, "grad_norm": 19.637544631958008, "learning_rate": 3.8820189128866074e-06, "loss": 2.3011, "step": 4407000 }, { "epoch": 9.226270221388619, "grad_norm": 20.993396759033203, "learning_rate": 3.876774656396315e-06, "loss": 2.2919, "step": 4407500 }, { "epoch": 9.227316877114244, "grad_norm": 20.843103408813477, "learning_rate": 3.871530399906023e-06, "loss": 2.2947, "step": 4408000 }, { "epoch": 9.22836353283987, "grad_norm": 18.7309627532959, "learning_rate": 3.866286143415731e-06, "loss": 2.2629, "step": 4408500 }, { "epoch": 9.229410188565495, "grad_norm": 21.131162643432617, "learning_rate": 3.861041886925439e-06, "loss": 2.2727, "step": 4409000 }, { "epoch": 9.23045684429112, "grad_norm": 18.836864471435547, "learning_rate": 3.855797630435148e-06, "loss": 2.2855, "step": 4409500 }, { "epoch": 9.231503500016746, "grad_norm": 19.034481048583984, "learning_rate": 3.850553373944855e-06, "loss": 2.2808, "step": 4410000 }, { "epoch": 9.232550155742372, "grad_norm": 21.27193832397461, "learning_rate": 3.845309117454564e-06, "loss": 2.2775, "step": 4410500 }, { "epoch": 9.233596811467997, "grad_norm": 18.932722091674805, "learning_rate": 3.840064860964272e-06, "loss": 2.3055, "step": 4411000 }, { "epoch": 9.234643467193623, "grad_norm": 19.58258819580078, "learning_rate": 3.83482060447398e-06, "loss": 2.2712, "step": 4411500 }, { "epoch": 9.235690122919248, "grad_norm": 20.739627838134766, "learning_rate": 3.829576347983689e-06, "loss": 2.286, "step": 4412000 }, { "epoch": 9.236736778644874, "grad_norm": 17.593486785888672, "learning_rate": 3.824332091493397e-06, "loss": 2.2883, "step": 4412500 }, { "epoch": 9.237783434370499, "grad_norm": 26.579673767089844, "learning_rate": 3.819087835003105e-06, "loss": 2.2946, "step": 4413000 }, { "epoch": 9.238830090096124, "grad_norm": 20.263368606567383, "learning_rate": 3.813843578512813e-06, "loss": 2.2813, "step": 4413500 }, { "epoch": 9.23987674582175, "grad_norm": 24.142623901367188, "learning_rate": 3.8085993220225215e-06, "loss": 2.2912, "step": 4414000 }, { "epoch": 9.240923401547375, "grad_norm": 20.35237693786621, "learning_rate": 3.803355065532229e-06, "loss": 2.2866, "step": 4414500 }, { "epoch": 9.241970057273, "grad_norm": 22.073959350585938, "learning_rate": 3.7981108090419373e-06, "loss": 2.2943, "step": 4415000 }, { "epoch": 9.243016712998626, "grad_norm": 21.03147315979004, "learning_rate": 3.792866552551646e-06, "loss": 2.2755, "step": 4415500 }, { "epoch": 9.244063368724252, "grad_norm": 17.554853439331055, "learning_rate": 3.7876222960613535e-06, "loss": 2.2903, "step": 4416000 }, { "epoch": 9.245110024449877, "grad_norm": 20.798084259033203, "learning_rate": 3.782378039571062e-06, "loss": 2.2739, "step": 4416500 }, { "epoch": 9.246156680175503, "grad_norm": 24.531173706054688, "learning_rate": 3.7771337830807706e-06, "loss": 2.2796, "step": 4417000 }, { "epoch": 9.247203335901128, "grad_norm": 20.44510841369629, "learning_rate": 3.7718895265904783e-06, "loss": 2.2853, "step": 4417500 }, { "epoch": 9.248249991626754, "grad_norm": 21.653724670410156, "learning_rate": 3.7666452701001864e-06, "loss": 2.2915, "step": 4418000 }, { "epoch": 9.249296647352379, "grad_norm": 17.903188705444336, "learning_rate": 3.761401013609895e-06, "loss": 2.2706, "step": 4418500 }, { "epoch": 9.250343303078004, "grad_norm": 23.67659568786621, "learning_rate": 3.7561567571196026e-06, "loss": 2.2665, "step": 4419000 }, { "epoch": 9.25138995880363, "grad_norm": 18.186599731445312, "learning_rate": 3.750912500629311e-06, "loss": 2.3039, "step": 4419500 }, { "epoch": 9.252436614529255, "grad_norm": 21.518266677856445, "learning_rate": 3.745668244139019e-06, "loss": 2.2809, "step": 4420000 }, { "epoch": 9.25348327025488, "grad_norm": 23.166595458984375, "learning_rate": 3.7404239876487274e-06, "loss": 2.2878, "step": 4420500 }, { "epoch": 9.254529925980506, "grad_norm": 17.690670013427734, "learning_rate": 3.7351797311584355e-06, "loss": 2.2949, "step": 4421000 }, { "epoch": 9.255576581706132, "grad_norm": 19.260738372802734, "learning_rate": 3.729935474668143e-06, "loss": 2.2805, "step": 4421500 }, { "epoch": 9.256623237431757, "grad_norm": 17.072120666503906, "learning_rate": 3.7246912181778517e-06, "loss": 2.2876, "step": 4422000 }, { "epoch": 9.257669893157384, "grad_norm": 26.4244384765625, "learning_rate": 3.7194469616875603e-06, "loss": 2.2825, "step": 4422500 }, { "epoch": 9.258716548883008, "grad_norm": 25.997196197509766, "learning_rate": 3.714202705197268e-06, "loss": 2.285, "step": 4423000 }, { "epoch": 9.259763204608635, "grad_norm": 22.5364990234375, "learning_rate": 3.7089584487069765e-06, "loss": 2.2796, "step": 4423500 }, { "epoch": 9.26080986033426, "grad_norm": 23.465707778930664, "learning_rate": 3.7037141922166846e-06, "loss": 2.3064, "step": 4424000 }, { "epoch": 9.261856516059886, "grad_norm": 18.055179595947266, "learning_rate": 3.6984699357263923e-06, "loss": 2.3017, "step": 4424500 }, { "epoch": 9.262903171785512, "grad_norm": 18.785274505615234, "learning_rate": 3.693225679236101e-06, "loss": 2.2939, "step": 4425000 }, { "epoch": 9.263949827511137, "grad_norm": 20.643871307373047, "learning_rate": 3.6879814227458094e-06, "loss": 2.2842, "step": 4425500 }, { "epoch": 9.264996483236763, "grad_norm": 21.1407470703125, "learning_rate": 3.682737166255517e-06, "loss": 2.2716, "step": 4426000 }, { "epoch": 9.266043138962388, "grad_norm": 18.762248992919922, "learning_rate": 3.6774929097652256e-06, "loss": 2.2802, "step": 4426500 }, { "epoch": 9.267089794688014, "grad_norm": 21.851898193359375, "learning_rate": 3.6722486532749337e-06, "loss": 2.2843, "step": 4427000 }, { "epoch": 9.268136450413639, "grad_norm": 18.710153579711914, "learning_rate": 3.6670043967846414e-06, "loss": 2.2737, "step": 4427500 }, { "epoch": 9.269183106139264, "grad_norm": 16.66525650024414, "learning_rate": 3.66176014029435e-06, "loss": 2.2964, "step": 4428000 }, { "epoch": 9.27022976186489, "grad_norm": 19.91016960144043, "learning_rate": 3.6565158838040576e-06, "loss": 2.2994, "step": 4428500 }, { "epoch": 9.271276417590515, "grad_norm": 19.429058074951172, "learning_rate": 3.651271627313766e-06, "loss": 2.2627, "step": 4429000 }, { "epoch": 9.27232307331614, "grad_norm": 17.213958740234375, "learning_rate": 3.6460273708234743e-06, "loss": 2.2756, "step": 4429500 }, { "epoch": 9.273369729041766, "grad_norm": 20.308687210083008, "learning_rate": 3.6407831143331824e-06, "loss": 2.2751, "step": 4430000 }, { "epoch": 9.274416384767392, "grad_norm": 19.282485961914062, "learning_rate": 3.6355388578428905e-06, "loss": 2.2907, "step": 4430500 }, { "epoch": 9.275463040493017, "grad_norm": 22.604110717773438, "learning_rate": 3.630294601352599e-06, "loss": 2.2864, "step": 4431000 }, { "epoch": 9.276509696218643, "grad_norm": 17.02770233154297, "learning_rate": 3.6250503448623068e-06, "loss": 2.2534, "step": 4431500 }, { "epoch": 9.277556351944268, "grad_norm": 18.991140365600586, "learning_rate": 3.6198060883720153e-06, "loss": 2.279, "step": 4432000 }, { "epoch": 9.278603007669894, "grad_norm": 19.077239990234375, "learning_rate": 3.6145618318817234e-06, "loss": 2.277, "step": 4432500 }, { "epoch": 9.279649663395519, "grad_norm": 15.996007919311523, "learning_rate": 3.609317575391431e-06, "loss": 2.2776, "step": 4433000 }, { "epoch": 9.280696319121144, "grad_norm": 17.481891632080078, "learning_rate": 3.6040733189011396e-06, "loss": 2.2907, "step": 4433500 }, { "epoch": 9.28174297484677, "grad_norm": 17.24573516845703, "learning_rate": 3.598829062410848e-06, "loss": 2.291, "step": 4434000 }, { "epoch": 9.282789630572395, "grad_norm": 20.22844123840332, "learning_rate": 3.593584805920556e-06, "loss": 2.2548, "step": 4434500 }, { "epoch": 9.28383628629802, "grad_norm": 23.27031135559082, "learning_rate": 3.5883405494302644e-06, "loss": 2.2918, "step": 4435000 }, { "epoch": 9.284882942023646, "grad_norm": 17.480432510375977, "learning_rate": 3.5830962929399725e-06, "loss": 2.2868, "step": 4435500 }, { "epoch": 9.285929597749272, "grad_norm": 18.90777015686035, "learning_rate": 3.57785203644968e-06, "loss": 2.2715, "step": 4436000 }, { "epoch": 9.286976253474897, "grad_norm": 19.3804931640625, "learning_rate": 3.5726077799593887e-06, "loss": 2.279, "step": 4436500 }, { "epoch": 9.288022909200523, "grad_norm": 22.04200553894043, "learning_rate": 3.5673635234690964e-06, "loss": 2.2897, "step": 4437000 }, { "epoch": 9.289069564926148, "grad_norm": 17.810609817504883, "learning_rate": 3.562119266978805e-06, "loss": 2.2884, "step": 4437500 }, { "epoch": 9.290116220651774, "grad_norm": 15.982083320617676, "learning_rate": 3.5568750104885135e-06, "loss": 2.283, "step": 4438000 }, { "epoch": 9.291162876377399, "grad_norm": 23.540910720825195, "learning_rate": 3.551630753998221e-06, "loss": 2.2826, "step": 4438500 }, { "epoch": 9.292209532103024, "grad_norm": 19.20177459716797, "learning_rate": 3.5463864975079293e-06, "loss": 2.3061, "step": 4439000 }, { "epoch": 9.29325618782865, "grad_norm": 20.368595123291016, "learning_rate": 3.541142241017638e-06, "loss": 2.2806, "step": 4439500 }, { "epoch": 9.294302843554275, "grad_norm": 20.915782928466797, "learning_rate": 3.5358979845273456e-06, "loss": 2.2821, "step": 4440000 }, { "epoch": 9.2953494992799, "grad_norm": 17.152868270874023, "learning_rate": 3.530653728037054e-06, "loss": 2.2673, "step": 4440500 }, { "epoch": 9.296396155005526, "grad_norm": 17.349132537841797, "learning_rate": 3.525409471546762e-06, "loss": 2.2926, "step": 4441000 }, { "epoch": 9.297442810731152, "grad_norm": 23.81138038635254, "learning_rate": 3.5201652150564703e-06, "loss": 2.2786, "step": 4441500 }, { "epoch": 9.298489466456777, "grad_norm": 21.2548828125, "learning_rate": 3.5149209585661784e-06, "loss": 2.2869, "step": 4442000 }, { "epoch": 9.299536122182403, "grad_norm": 18.95099639892578, "learning_rate": 3.509676702075887e-06, "loss": 2.2556, "step": 4442500 }, { "epoch": 9.300582777908028, "grad_norm": 21.76947784423828, "learning_rate": 3.5044324455855947e-06, "loss": 2.2741, "step": 4443000 }, { "epoch": 9.301629433633654, "grad_norm": 21.59093475341797, "learning_rate": 3.499188189095303e-06, "loss": 2.2739, "step": 4443500 }, { "epoch": 9.302676089359279, "grad_norm": 25.101882934570312, "learning_rate": 3.4939439326050113e-06, "loss": 2.2983, "step": 4444000 }, { "epoch": 9.303722745084904, "grad_norm": 20.026227951049805, "learning_rate": 3.488699676114719e-06, "loss": 2.2727, "step": 4444500 }, { "epoch": 9.30476940081053, "grad_norm": 21.642250061035156, "learning_rate": 3.4834554196244275e-06, "loss": 2.2851, "step": 4445000 }, { "epoch": 9.305816056536155, "grad_norm": 18.36587905883789, "learning_rate": 3.4782111631341352e-06, "loss": 2.273, "step": 4445500 }, { "epoch": 9.30686271226178, "grad_norm": 20.162981033325195, "learning_rate": 3.4729669066438438e-06, "loss": 2.2764, "step": 4446000 }, { "epoch": 9.307909367987406, "grad_norm": 23.069665908813477, "learning_rate": 3.4677226501535523e-06, "loss": 2.2794, "step": 4446500 }, { "epoch": 9.308956023713032, "grad_norm": 18.348405838012695, "learning_rate": 3.46247839366326e-06, "loss": 2.2682, "step": 4447000 }, { "epoch": 9.310002679438657, "grad_norm": 23.24551773071289, "learning_rate": 3.457234137172968e-06, "loss": 2.2638, "step": 4447500 }, { "epoch": 9.311049335164283, "grad_norm": 22.026296615600586, "learning_rate": 3.4519898806826767e-06, "loss": 2.2722, "step": 4448000 }, { "epoch": 9.312095990889908, "grad_norm": 18.762889862060547, "learning_rate": 3.4467456241923843e-06, "loss": 2.2642, "step": 4448500 }, { "epoch": 9.313142646615534, "grad_norm": 16.014949798583984, "learning_rate": 3.441501367702093e-06, "loss": 2.2884, "step": 4449000 }, { "epoch": 9.314189302341159, "grad_norm": 19.07622718811035, "learning_rate": 3.4362571112118014e-06, "loss": 2.2727, "step": 4449500 }, { "epoch": 9.315235958066785, "grad_norm": 19.085323333740234, "learning_rate": 3.431012854721509e-06, "loss": 2.2904, "step": 4450000 }, { "epoch": 9.31628261379241, "grad_norm": 21.63385009765625, "learning_rate": 3.4257685982312172e-06, "loss": 2.2689, "step": 4450500 }, { "epoch": 9.317329269518035, "grad_norm": 22.917495727539062, "learning_rate": 3.4205243417409258e-06, "loss": 2.2751, "step": 4451000 }, { "epoch": 9.31837592524366, "grad_norm": 20.63129234313965, "learning_rate": 3.4152800852506335e-06, "loss": 2.2779, "step": 4451500 }, { "epoch": 9.319422580969286, "grad_norm": 19.61370849609375, "learning_rate": 3.410035828760342e-06, "loss": 2.2887, "step": 4452000 }, { "epoch": 9.320469236694912, "grad_norm": 27.248231887817383, "learning_rate": 3.4047915722700505e-06, "loss": 2.2736, "step": 4452500 }, { "epoch": 9.321515892420537, "grad_norm": 20.17431640625, "learning_rate": 3.3995473157797582e-06, "loss": 2.2627, "step": 4453000 }, { "epoch": 9.322562548146163, "grad_norm": 21.01453971862793, "learning_rate": 3.3943030592894663e-06, "loss": 2.2859, "step": 4453500 }, { "epoch": 9.323609203871788, "grad_norm": 22.328529357910156, "learning_rate": 3.389058802799174e-06, "loss": 2.2795, "step": 4454000 }, { "epoch": 9.324655859597414, "grad_norm": 18.320640563964844, "learning_rate": 3.3838145463088826e-06, "loss": 2.2721, "step": 4454500 }, { "epoch": 9.325702515323039, "grad_norm": 20.254974365234375, "learning_rate": 3.378570289818591e-06, "loss": 2.2855, "step": 4455000 }, { "epoch": 9.326749171048665, "grad_norm": 18.133684158325195, "learning_rate": 3.373326033328299e-06, "loss": 2.2933, "step": 4455500 }, { "epoch": 9.32779582677429, "grad_norm": 18.567306518554688, "learning_rate": 3.3680817768380073e-06, "loss": 2.2827, "step": 4456000 }, { "epoch": 9.328842482499915, "grad_norm": 17.48326301574707, "learning_rate": 3.3628375203477155e-06, "loss": 2.2697, "step": 4456500 }, { "epoch": 9.329889138225541, "grad_norm": 19.556678771972656, "learning_rate": 3.357593263857423e-06, "loss": 2.2888, "step": 4457000 }, { "epoch": 9.330935793951166, "grad_norm": 19.885360717773438, "learning_rate": 3.3523490073671317e-06, "loss": 2.2499, "step": 4457500 }, { "epoch": 9.331982449676794, "grad_norm": 18.06813621520996, "learning_rate": 3.3471047508768402e-06, "loss": 2.2802, "step": 4458000 }, { "epoch": 9.333029105402419, "grad_norm": 17.302576065063477, "learning_rate": 3.341860494386548e-06, "loss": 2.2618, "step": 4458500 }, { "epoch": 9.334075761128044, "grad_norm": 21.26197052001953, "learning_rate": 3.336616237896256e-06, "loss": 2.2757, "step": 4459000 }, { "epoch": 9.33512241685367, "grad_norm": 24.758995056152344, "learning_rate": 3.3313719814059646e-06, "loss": 2.2715, "step": 4459500 }, { "epoch": 9.336169072579295, "grad_norm": 17.756546020507812, "learning_rate": 3.3261277249156723e-06, "loss": 2.2795, "step": 4460000 }, { "epoch": 9.33721572830492, "grad_norm": 18.53556251525879, "learning_rate": 3.320883468425381e-06, "loss": 2.2872, "step": 4460500 }, { "epoch": 9.338262384030546, "grad_norm": 18.82870864868164, "learning_rate": 3.3156392119350893e-06, "loss": 2.293, "step": 4461000 }, { "epoch": 9.339309039756172, "grad_norm": 23.395771026611328, "learning_rate": 3.310394955444797e-06, "loss": 2.2824, "step": 4461500 }, { "epoch": 9.340355695481797, "grad_norm": 20.375104904174805, "learning_rate": 3.305150698954505e-06, "loss": 2.2804, "step": 4462000 }, { "epoch": 9.341402351207423, "grad_norm": 28.600778579711914, "learning_rate": 3.299906442464213e-06, "loss": 2.2962, "step": 4462500 }, { "epoch": 9.342449006933048, "grad_norm": 19.321773529052734, "learning_rate": 3.2946621859739214e-06, "loss": 2.2876, "step": 4463000 }, { "epoch": 9.343495662658674, "grad_norm": 17.839889526367188, "learning_rate": 3.28941792948363e-06, "loss": 2.279, "step": 4463500 }, { "epoch": 9.344542318384299, "grad_norm": 16.33328628540039, "learning_rate": 3.2841736729933376e-06, "loss": 2.2732, "step": 4464000 }, { "epoch": 9.345588974109925, "grad_norm": 21.319732666015625, "learning_rate": 3.278929416503046e-06, "loss": 2.2955, "step": 4464500 }, { "epoch": 9.34663562983555, "grad_norm": 22.49357032775879, "learning_rate": 3.2736851600127543e-06, "loss": 2.2561, "step": 4465000 }, { "epoch": 9.347682285561175, "grad_norm": 17.07538604736328, "learning_rate": 3.268440903522462e-06, "loss": 2.2749, "step": 4465500 }, { "epoch": 9.3487289412868, "grad_norm": 18.305496215820312, "learning_rate": 3.2631966470321705e-06, "loss": 2.2893, "step": 4466000 }, { "epoch": 9.349775597012426, "grad_norm": 22.014144897460938, "learning_rate": 3.257952390541879e-06, "loss": 2.3062, "step": 4466500 }, { "epoch": 9.350822252738052, "grad_norm": 20.91327667236328, "learning_rate": 3.2527081340515867e-06, "loss": 2.2835, "step": 4467000 }, { "epoch": 9.351868908463677, "grad_norm": 21.974794387817383, "learning_rate": 3.2474638775612952e-06, "loss": 2.2758, "step": 4467500 }, { "epoch": 9.352915564189303, "grad_norm": 20.850360870361328, "learning_rate": 3.2422196210710034e-06, "loss": 2.2799, "step": 4468000 }, { "epoch": 9.353962219914928, "grad_norm": 17.104040145874023, "learning_rate": 3.236975364580711e-06, "loss": 2.259, "step": 4468500 }, { "epoch": 9.355008875640554, "grad_norm": 22.329442977905273, "learning_rate": 3.2317311080904196e-06, "loss": 2.2936, "step": 4469000 }, { "epoch": 9.356055531366179, "grad_norm": 21.343570709228516, "learning_rate": 3.226486851600128e-06, "loss": 2.2706, "step": 4469500 }, { "epoch": 9.357102187091805, "grad_norm": 17.726253509521484, "learning_rate": 3.221242595109836e-06, "loss": 2.2613, "step": 4470000 }, { "epoch": 9.35814884281743, "grad_norm": 26.13218879699707, "learning_rate": 3.215998338619544e-06, "loss": 2.2873, "step": 4470500 }, { "epoch": 9.359195498543055, "grad_norm": 21.039480209350586, "learning_rate": 3.210754082129252e-06, "loss": 2.2743, "step": 4471000 }, { "epoch": 9.360242154268681, "grad_norm": 23.068546295166016, "learning_rate": 3.20550982563896e-06, "loss": 2.2702, "step": 4471500 }, { "epoch": 9.361288809994306, "grad_norm": 18.48598861694336, "learning_rate": 3.2002655691486687e-06, "loss": 2.2822, "step": 4472000 }, { "epoch": 9.362335465719932, "grad_norm": 17.87399673461914, "learning_rate": 3.1950213126583764e-06, "loss": 2.2832, "step": 4472500 }, { "epoch": 9.363382121445557, "grad_norm": 20.041589736938477, "learning_rate": 3.189777056168085e-06, "loss": 2.2816, "step": 4473000 }, { "epoch": 9.364428777171183, "grad_norm": 21.6924991607666, "learning_rate": 3.184532799677793e-06, "loss": 2.2782, "step": 4473500 }, { "epoch": 9.365475432896808, "grad_norm": 18.272390365600586, "learning_rate": 3.1792885431875007e-06, "loss": 2.2736, "step": 4474000 }, { "epoch": 9.366522088622434, "grad_norm": 16.035541534423828, "learning_rate": 3.1740442866972093e-06, "loss": 2.2879, "step": 4474500 }, { "epoch": 9.367568744348059, "grad_norm": 20.943544387817383, "learning_rate": 3.168800030206918e-06, "loss": 2.2804, "step": 4475000 }, { "epoch": 9.368615400073685, "grad_norm": 15.797804832458496, "learning_rate": 3.1635557737166255e-06, "loss": 2.2686, "step": 4475500 }, { "epoch": 9.36966205579931, "grad_norm": 18.32731819152832, "learning_rate": 3.158311517226334e-06, "loss": 2.2761, "step": 4476000 }, { "epoch": 9.370708711524935, "grad_norm": 17.697959899902344, "learning_rate": 3.153067260736042e-06, "loss": 2.2749, "step": 4476500 }, { "epoch": 9.371755367250561, "grad_norm": 20.72211456298828, "learning_rate": 3.14782300424575e-06, "loss": 2.2805, "step": 4477000 }, { "epoch": 9.372802022976186, "grad_norm": 16.600521087646484, "learning_rate": 3.1425787477554584e-06, "loss": 2.2626, "step": 4477500 }, { "epoch": 9.373848678701812, "grad_norm": 18.857118606567383, "learning_rate": 3.137334491265167e-06, "loss": 2.2707, "step": 4478000 }, { "epoch": 9.374895334427437, "grad_norm": 22.30640983581543, "learning_rate": 3.1320902347748746e-06, "loss": 2.2979, "step": 4478500 }, { "epoch": 9.375941990153063, "grad_norm": 19.644269943237305, "learning_rate": 3.126845978284583e-06, "loss": 2.2593, "step": 4479000 }, { "epoch": 9.376988645878688, "grad_norm": 19.408536911010742, "learning_rate": 3.121601721794291e-06, "loss": 2.2739, "step": 4479500 }, { "epoch": 9.378035301604314, "grad_norm": 21.23906135559082, "learning_rate": 3.116357465303999e-06, "loss": 2.2819, "step": 4480000 }, { "epoch": 9.379081957329939, "grad_norm": 21.092884063720703, "learning_rate": 3.1111132088137075e-06, "loss": 2.2555, "step": 4480500 }, { "epoch": 9.380128613055565, "grad_norm": 18.821908950805664, "learning_rate": 3.1058689523234156e-06, "loss": 2.2684, "step": 4481000 }, { "epoch": 9.38117526878119, "grad_norm": 23.560489654541016, "learning_rate": 3.1006246958331237e-06, "loss": 2.2704, "step": 4481500 }, { "epoch": 9.382221924506815, "grad_norm": 20.7930908203125, "learning_rate": 3.095380439342832e-06, "loss": 2.2626, "step": 4482000 }, { "epoch": 9.383268580232441, "grad_norm": 20.701223373413086, "learning_rate": 3.09013618285254e-06, "loss": 2.2631, "step": 4482500 }, { "epoch": 9.384315235958066, "grad_norm": 18.342422485351562, "learning_rate": 3.084891926362248e-06, "loss": 2.2708, "step": 4483000 }, { "epoch": 9.385361891683692, "grad_norm": 19.439952850341797, "learning_rate": 3.079647669871956e-06, "loss": 2.2662, "step": 4483500 }, { "epoch": 9.386408547409317, "grad_norm": 18.09695816040039, "learning_rate": 3.0744034133816647e-06, "loss": 2.291, "step": 4484000 }, { "epoch": 9.387455203134943, "grad_norm": 22.447349548339844, "learning_rate": 3.069159156891373e-06, "loss": 2.2677, "step": 4484500 }, { "epoch": 9.388501858860568, "grad_norm": 20.407575607299805, "learning_rate": 3.063914900401081e-06, "loss": 2.2876, "step": 4485000 }, { "epoch": 9.389548514586194, "grad_norm": 17.72489356994629, "learning_rate": 3.058670643910789e-06, "loss": 2.2635, "step": 4485500 }, { "epoch": 9.39059517031182, "grad_norm": 19.36278533935547, "learning_rate": 3.053426387420497e-06, "loss": 2.2732, "step": 4486000 }, { "epoch": 9.391641826037445, "grad_norm": 19.922956466674805, "learning_rate": 3.0481821309302053e-06, "loss": 2.3054, "step": 4486500 }, { "epoch": 9.39268848176307, "grad_norm": 20.684906005859375, "learning_rate": 3.0429378744399134e-06, "loss": 2.2795, "step": 4487000 }, { "epoch": 9.393735137488695, "grad_norm": 18.03009033203125, "learning_rate": 3.037693617949622e-06, "loss": 2.2891, "step": 4487500 }, { "epoch": 9.394781793214321, "grad_norm": 20.429933547973633, "learning_rate": 3.03244936145933e-06, "loss": 2.2826, "step": 4488000 }, { "epoch": 9.395828448939946, "grad_norm": 21.82141876220703, "learning_rate": 3.0272051049690378e-06, "loss": 2.2658, "step": 4488500 }, { "epoch": 9.396875104665572, "grad_norm": 17.468692779541016, "learning_rate": 3.0219608484787463e-06, "loss": 2.2984, "step": 4489000 }, { "epoch": 9.397921760391197, "grad_norm": 19.081832885742188, "learning_rate": 3.0167165919884544e-06, "loss": 2.2736, "step": 4489500 }, { "epoch": 9.398968416116823, "grad_norm": 18.400793075561523, "learning_rate": 3.0114723354981625e-06, "loss": 2.2687, "step": 4490000 }, { "epoch": 9.400015071842448, "grad_norm": 18.252248764038086, "learning_rate": 3.0062280790078706e-06, "loss": 2.3056, "step": 4490500 }, { "epoch": 9.401061727568074, "grad_norm": 20.20081329345703, "learning_rate": 3.0009838225175788e-06, "loss": 2.2747, "step": 4491000 }, { "epoch": 9.4021083832937, "grad_norm": 22.813974380493164, "learning_rate": 2.995739566027287e-06, "loss": 2.2951, "step": 4491500 }, { "epoch": 9.403155039019325, "grad_norm": 19.81845474243164, "learning_rate": 2.990495309536995e-06, "loss": 2.2647, "step": 4492000 }, { "epoch": 9.404201694744952, "grad_norm": 18.321609497070312, "learning_rate": 2.9852510530467035e-06, "loss": 2.2715, "step": 4492500 }, { "epoch": 9.405248350470576, "grad_norm": 21.613605499267578, "learning_rate": 2.9800067965564116e-06, "loss": 2.2693, "step": 4493000 }, { "epoch": 9.406295006196203, "grad_norm": 26.154577255249023, "learning_rate": 2.9747625400661198e-06, "loss": 2.2725, "step": 4493500 }, { "epoch": 9.407341661921828, "grad_norm": 21.758941650390625, "learning_rate": 2.969518283575828e-06, "loss": 2.2681, "step": 4494000 }, { "epoch": 9.408388317647454, "grad_norm": 19.452844619750977, "learning_rate": 2.964274027085536e-06, "loss": 2.2628, "step": 4494500 }, { "epoch": 9.409434973373079, "grad_norm": 16.428688049316406, "learning_rate": 2.959029770595244e-06, "loss": 2.2742, "step": 4495000 }, { "epoch": 9.410481629098705, "grad_norm": 18.4339656829834, "learning_rate": 2.9537855141049522e-06, "loss": 2.2703, "step": 4495500 }, { "epoch": 9.41152828482433, "grad_norm": 21.99226951599121, "learning_rate": 2.9485412576146608e-06, "loss": 2.2705, "step": 4496000 }, { "epoch": 9.412574940549955, "grad_norm": 25.182697296142578, "learning_rate": 2.943297001124369e-06, "loss": 2.283, "step": 4496500 }, { "epoch": 9.413621596275581, "grad_norm": 18.73259735107422, "learning_rate": 2.938052744634077e-06, "loss": 2.2759, "step": 4497000 }, { "epoch": 9.414668252001206, "grad_norm": 21.643856048583984, "learning_rate": 2.932808488143785e-06, "loss": 2.2603, "step": 4497500 }, { "epoch": 9.415714907726832, "grad_norm": 18.14364242553711, "learning_rate": 2.927564231653493e-06, "loss": 2.2925, "step": 4498000 }, { "epoch": 9.416761563452457, "grad_norm": 17.673852920532227, "learning_rate": 2.9223199751632013e-06, "loss": 2.2864, "step": 4498500 }, { "epoch": 9.417808219178083, "grad_norm": 20.64811897277832, "learning_rate": 2.91707571867291e-06, "loss": 2.2679, "step": 4499000 }, { "epoch": 9.418854874903708, "grad_norm": 18.664325714111328, "learning_rate": 2.911831462182618e-06, "loss": 2.2967, "step": 4499500 }, { "epoch": 9.419901530629334, "grad_norm": 24.847583770751953, "learning_rate": 2.9065872056923257e-06, "loss": 2.3023, "step": 4500000 }, { "epoch": 9.42094818635496, "grad_norm": 18.811235427856445, "learning_rate": 2.9013429492020338e-06, "loss": 2.302, "step": 4500500 }, { "epoch": 9.421994842080585, "grad_norm": 17.51206398010254, "learning_rate": 2.8960986927117423e-06, "loss": 2.2959, "step": 4501000 }, { "epoch": 9.42304149780621, "grad_norm": 20.04773712158203, "learning_rate": 2.8908544362214504e-06, "loss": 2.2874, "step": 4501500 }, { "epoch": 9.424088153531835, "grad_norm": 19.016279220581055, "learning_rate": 2.8856101797311586e-06, "loss": 2.2616, "step": 4502000 }, { "epoch": 9.425134809257461, "grad_norm": 20.8323974609375, "learning_rate": 2.880365923240867e-06, "loss": 2.2911, "step": 4502500 }, { "epoch": 9.426181464983086, "grad_norm": 16.489879608154297, "learning_rate": 2.8751216667505748e-06, "loss": 2.2802, "step": 4503000 }, { "epoch": 9.427228120708712, "grad_norm": 23.83788299560547, "learning_rate": 2.869877410260283e-06, "loss": 2.2676, "step": 4503500 }, { "epoch": 9.428274776434337, "grad_norm": 19.363256454467773, "learning_rate": 2.864633153769991e-06, "loss": 2.2788, "step": 4504000 }, { "epoch": 9.429321432159963, "grad_norm": 18.54001808166504, "learning_rate": 2.8593888972796995e-06, "loss": 2.2566, "step": 4504500 }, { "epoch": 9.430368087885588, "grad_norm": 19.760265350341797, "learning_rate": 2.8541446407894077e-06, "loss": 2.2684, "step": 4505000 }, { "epoch": 9.431414743611214, "grad_norm": 16.15563201904297, "learning_rate": 2.8489003842991158e-06, "loss": 2.2756, "step": 4505500 }, { "epoch": 9.43246139933684, "grad_norm": 20.064857482910156, "learning_rate": 2.843656127808824e-06, "loss": 2.2584, "step": 4506000 }, { "epoch": 9.433508055062465, "grad_norm": 20.883291244506836, "learning_rate": 2.838411871318532e-06, "loss": 2.2705, "step": 4506500 }, { "epoch": 9.43455471078809, "grad_norm": 20.164310455322266, "learning_rate": 2.83316761482824e-06, "loss": 2.2776, "step": 4507000 }, { "epoch": 9.435601366513716, "grad_norm": 18.35101318359375, "learning_rate": 2.8279233583379487e-06, "loss": 2.2868, "step": 4507500 }, { "epoch": 9.436648022239341, "grad_norm": 19.531997680664062, "learning_rate": 2.8226791018476568e-06, "loss": 2.2611, "step": 4508000 }, { "epoch": 9.437694677964966, "grad_norm": 19.079402923583984, "learning_rate": 2.817434845357365e-06, "loss": 2.2706, "step": 4508500 }, { "epoch": 9.438741333690592, "grad_norm": 19.734542846679688, "learning_rate": 2.8121905888670726e-06, "loss": 2.2838, "step": 4509000 }, { "epoch": 9.439787989416217, "grad_norm": 18.340749740600586, "learning_rate": 2.806946332376781e-06, "loss": 2.2642, "step": 4509500 }, { "epoch": 9.440834645141843, "grad_norm": 19.376203536987305, "learning_rate": 2.8017020758864892e-06, "loss": 2.2947, "step": 4510000 }, { "epoch": 9.441881300867468, "grad_norm": 18.3689022064209, "learning_rate": 2.7964578193961973e-06, "loss": 2.2692, "step": 4510500 }, { "epoch": 9.442927956593094, "grad_norm": 18.196712493896484, "learning_rate": 2.791213562905906e-06, "loss": 2.2642, "step": 4511000 }, { "epoch": 9.44397461231872, "grad_norm": 19.89090347290039, "learning_rate": 2.785969306415614e-06, "loss": 2.2645, "step": 4511500 }, { "epoch": 9.445021268044345, "grad_norm": 19.982572555541992, "learning_rate": 2.7807250499253217e-06, "loss": 2.2826, "step": 4512000 }, { "epoch": 9.44606792376997, "grad_norm": 18.772241592407227, "learning_rate": 2.77548079343503e-06, "loss": 2.2895, "step": 4512500 }, { "epoch": 9.447114579495596, "grad_norm": 19.250459671020508, "learning_rate": 2.7702365369447383e-06, "loss": 2.2874, "step": 4513000 }, { "epoch": 9.448161235221221, "grad_norm": 19.564289093017578, "learning_rate": 2.7649922804544465e-06, "loss": 2.287, "step": 4513500 }, { "epoch": 9.449207890946846, "grad_norm": 19.857717514038086, "learning_rate": 2.7597480239641546e-06, "loss": 2.2751, "step": 4514000 }, { "epoch": 9.450254546672472, "grad_norm": 23.813962936401367, "learning_rate": 2.7545037674738627e-06, "loss": 2.2424, "step": 4514500 }, { "epoch": 9.451301202398097, "grad_norm": 25.025609970092773, "learning_rate": 2.749259510983571e-06, "loss": 2.2839, "step": 4515000 }, { "epoch": 9.452347858123723, "grad_norm": 16.605770111083984, "learning_rate": 2.744015254493279e-06, "loss": 2.2792, "step": 4515500 }, { "epoch": 9.453394513849348, "grad_norm": 22.587543487548828, "learning_rate": 2.7387709980029875e-06, "loss": 2.2688, "step": 4516000 }, { "epoch": 9.454441169574974, "grad_norm": 22.371877670288086, "learning_rate": 2.7335267415126956e-06, "loss": 2.2878, "step": 4516500 }, { "epoch": 9.4554878253006, "grad_norm": 19.463083267211914, "learning_rate": 2.7282824850224037e-06, "loss": 2.2721, "step": 4517000 }, { "epoch": 9.456534481026225, "grad_norm": 21.374292373657227, "learning_rate": 2.723038228532112e-06, "loss": 2.2874, "step": 4517500 }, { "epoch": 9.45758113675185, "grad_norm": 22.671363830566406, "learning_rate": 2.71779397204182e-06, "loss": 2.2716, "step": 4518000 }, { "epoch": 9.458627792477476, "grad_norm": 21.35173988342285, "learning_rate": 2.712549715551528e-06, "loss": 2.2887, "step": 4518500 }, { "epoch": 9.459674448203101, "grad_norm": 16.53076171875, "learning_rate": 2.707305459061236e-06, "loss": 2.2755, "step": 4519000 }, { "epoch": 9.460721103928726, "grad_norm": 20.585205078125, "learning_rate": 2.7020612025709447e-06, "loss": 2.278, "step": 4519500 }, { "epoch": 9.461767759654352, "grad_norm": 22.269563674926758, "learning_rate": 2.696816946080653e-06, "loss": 2.2653, "step": 4520000 }, { "epoch": 9.462814415379977, "grad_norm": 19.063833236694336, "learning_rate": 2.6915726895903605e-06, "loss": 2.2722, "step": 4520500 }, { "epoch": 9.463861071105603, "grad_norm": 20.644556045532227, "learning_rate": 2.6863284331000686e-06, "loss": 2.2633, "step": 4521000 }, { "epoch": 9.464907726831228, "grad_norm": 17.041353225708008, "learning_rate": 2.681084176609777e-06, "loss": 2.2667, "step": 4521500 }, { "epoch": 9.465954382556854, "grad_norm": 21.037372589111328, "learning_rate": 2.6758399201194853e-06, "loss": 2.2715, "step": 4522000 }, { "epoch": 9.46700103828248, "grad_norm": 18.39352798461914, "learning_rate": 2.6705956636291934e-06, "loss": 2.2795, "step": 4522500 }, { "epoch": 9.468047694008105, "grad_norm": 23.964929580688477, "learning_rate": 2.665351407138902e-06, "loss": 2.2616, "step": 4523000 }, { "epoch": 9.46909434973373, "grad_norm": 20.796173095703125, "learning_rate": 2.6601071506486096e-06, "loss": 2.2696, "step": 4523500 }, { "epoch": 9.470141005459356, "grad_norm": 24.2581787109375, "learning_rate": 2.6548628941583177e-06, "loss": 2.2431, "step": 4524000 }, { "epoch": 9.471187661184981, "grad_norm": 22.5634822845459, "learning_rate": 2.6496186376680263e-06, "loss": 2.2705, "step": 4524500 }, { "epoch": 9.472234316910606, "grad_norm": 19.59341049194336, "learning_rate": 2.6443743811777344e-06, "loss": 2.2732, "step": 4525000 }, { "epoch": 9.473280972636232, "grad_norm": 20.430068969726562, "learning_rate": 2.6391301246874425e-06, "loss": 2.2876, "step": 4525500 }, { "epoch": 9.474327628361857, "grad_norm": 27.80226707458496, "learning_rate": 2.6338858681971506e-06, "loss": 2.2863, "step": 4526000 }, { "epoch": 9.475374284087483, "grad_norm": 20.14683723449707, "learning_rate": 2.6286416117068587e-06, "loss": 2.2637, "step": 4526500 }, { "epoch": 9.47642093981311, "grad_norm": 19.923683166503906, "learning_rate": 2.623397355216567e-06, "loss": 2.2769, "step": 4527000 }, { "epoch": 9.477467595538734, "grad_norm": 21.23311424255371, "learning_rate": 2.618153098726275e-06, "loss": 2.2676, "step": 4527500 }, { "epoch": 9.478514251264361, "grad_norm": 19.830358505249023, "learning_rate": 2.6129088422359835e-06, "loss": 2.2667, "step": 4528000 }, { "epoch": 9.479560906989986, "grad_norm": 17.09720802307129, "learning_rate": 2.6076645857456916e-06, "loss": 2.277, "step": 4528500 }, { "epoch": 9.480607562715612, "grad_norm": 22.423799514770508, "learning_rate": 2.6024203292553997e-06, "loss": 2.302, "step": 4529000 }, { "epoch": 9.481654218441237, "grad_norm": 19.615890502929688, "learning_rate": 2.597176072765108e-06, "loss": 2.2813, "step": 4529500 }, { "epoch": 9.482700874166863, "grad_norm": 19.104385375976562, "learning_rate": 2.591931816274816e-06, "loss": 2.2541, "step": 4530000 }, { "epoch": 9.483747529892488, "grad_norm": 18.18878936767578, "learning_rate": 2.586687559784524e-06, "loss": 2.288, "step": 4530500 }, { "epoch": 9.484794185618114, "grad_norm": 20.15028190612793, "learning_rate": 2.581443303294232e-06, "loss": 2.2638, "step": 4531000 }, { "epoch": 9.48584084134374, "grad_norm": 17.733789443969727, "learning_rate": 2.5761990468039407e-06, "loss": 2.2661, "step": 4531500 }, { "epoch": 9.486887497069365, "grad_norm": 20.912355422973633, "learning_rate": 2.570954790313649e-06, "loss": 2.2725, "step": 4532000 }, { "epoch": 9.48793415279499, "grad_norm": 17.033737182617188, "learning_rate": 2.5657105338233565e-06, "loss": 2.27, "step": 4532500 }, { "epoch": 9.488980808520616, "grad_norm": 21.159072875976562, "learning_rate": 2.560466277333065e-06, "loss": 2.2768, "step": 4533000 }, { "epoch": 9.490027464246241, "grad_norm": 20.344961166381836, "learning_rate": 2.555222020842773e-06, "loss": 2.2712, "step": 4533500 }, { "epoch": 9.491074119971866, "grad_norm": 17.19852638244629, "learning_rate": 2.5499777643524813e-06, "loss": 2.295, "step": 4534000 }, { "epoch": 9.492120775697492, "grad_norm": 18.952333450317383, "learning_rate": 2.5447335078621894e-06, "loss": 2.2683, "step": 4534500 }, { "epoch": 9.493167431423117, "grad_norm": 22.249528884887695, "learning_rate": 2.5394892513718975e-06, "loss": 2.2776, "step": 4535000 }, { "epoch": 9.494214087148743, "grad_norm": 21.355072021484375, "learning_rate": 2.5342449948816056e-06, "loss": 2.2747, "step": 4535500 }, { "epoch": 9.495260742874368, "grad_norm": 23.418487548828125, "learning_rate": 2.5290007383913137e-06, "loss": 2.2708, "step": 4536000 }, { "epoch": 9.496307398599994, "grad_norm": 22.96525764465332, "learning_rate": 2.5237564819010223e-06, "loss": 2.2686, "step": 4536500 }, { "epoch": 9.49735405432562, "grad_norm": 19.478553771972656, "learning_rate": 2.5185122254107304e-06, "loss": 2.2773, "step": 4537000 }, { "epoch": 9.498400710051245, "grad_norm": 18.810884475708008, "learning_rate": 2.5132679689204385e-06, "loss": 2.3046, "step": 4537500 }, { "epoch": 9.49944736577687, "grad_norm": 21.882200241088867, "learning_rate": 2.5080237124301466e-06, "loss": 2.2857, "step": 4538000 }, { "epoch": 9.500494021502496, "grad_norm": 25.107284545898438, "learning_rate": 2.5027794559398547e-06, "loss": 2.2706, "step": 4538500 }, { "epoch": 9.501540677228121, "grad_norm": 22.442703247070312, "learning_rate": 2.497535199449563e-06, "loss": 2.2718, "step": 4539000 }, { "epoch": 9.502587332953746, "grad_norm": 20.45121192932129, "learning_rate": 2.492290942959271e-06, "loss": 2.2618, "step": 4539500 }, { "epoch": 9.503633988679372, "grad_norm": 22.00715446472168, "learning_rate": 2.4870466864689795e-06, "loss": 2.2799, "step": 4540000 }, { "epoch": 9.504680644404997, "grad_norm": 20.0579891204834, "learning_rate": 2.4818024299786876e-06, "loss": 2.2855, "step": 4540500 }, { "epoch": 9.505727300130623, "grad_norm": 14.918527603149414, "learning_rate": 2.4765581734883957e-06, "loss": 2.2725, "step": 4541000 }, { "epoch": 9.506773955856248, "grad_norm": 21.62523651123047, "learning_rate": 2.471313916998104e-06, "loss": 2.2847, "step": 4541500 }, { "epoch": 9.507820611581874, "grad_norm": 17.487836837768555, "learning_rate": 2.466069660507812e-06, "loss": 2.2783, "step": 4542000 }, { "epoch": 9.5088672673075, "grad_norm": 23.862579345703125, "learning_rate": 2.46082540401752e-06, "loss": 2.2611, "step": 4542500 }, { "epoch": 9.509913923033125, "grad_norm": 20.21156883239746, "learning_rate": 2.455581147527228e-06, "loss": 2.2697, "step": 4543000 }, { "epoch": 9.51096057875875, "grad_norm": 24.41371726989746, "learning_rate": 2.4503368910369367e-06, "loss": 2.2698, "step": 4543500 }, { "epoch": 9.512007234484376, "grad_norm": 20.86312484741211, "learning_rate": 2.4450926345466444e-06, "loss": 2.2978, "step": 4544000 }, { "epoch": 9.513053890210001, "grad_norm": 19.12100601196289, "learning_rate": 2.4398483780563525e-06, "loss": 2.258, "step": 4544500 }, { "epoch": 9.514100545935626, "grad_norm": 21.110584259033203, "learning_rate": 2.434604121566061e-06, "loss": 2.2704, "step": 4545000 }, { "epoch": 9.515147201661252, "grad_norm": 20.321636199951172, "learning_rate": 2.429359865075769e-06, "loss": 2.2801, "step": 4545500 }, { "epoch": 9.516193857386877, "grad_norm": 23.72125244140625, "learning_rate": 2.4241156085854773e-06, "loss": 2.2793, "step": 4546000 }, { "epoch": 9.517240513112503, "grad_norm": 26.78066062927246, "learning_rate": 2.4188713520951854e-06, "loss": 2.2539, "step": 4546500 }, { "epoch": 9.518287168838128, "grad_norm": 19.45515251159668, "learning_rate": 2.4136270956048935e-06, "loss": 2.2789, "step": 4547000 }, { "epoch": 9.519333824563754, "grad_norm": 20.349693298339844, "learning_rate": 2.4083828391146017e-06, "loss": 2.2683, "step": 4547500 }, { "epoch": 9.52038048028938, "grad_norm": 19.463817596435547, "learning_rate": 2.4031385826243098e-06, "loss": 2.2564, "step": 4548000 }, { "epoch": 9.521427136015005, "grad_norm": 20.823192596435547, "learning_rate": 2.3978943261340183e-06, "loss": 2.2579, "step": 4548500 }, { "epoch": 9.52247379174063, "grad_norm": 19.449644088745117, "learning_rate": 2.3926500696437264e-06, "loss": 2.2865, "step": 4549000 }, { "epoch": 9.523520447466256, "grad_norm": 23.676288604736328, "learning_rate": 2.3874058131534345e-06, "loss": 2.2782, "step": 4549500 }, { "epoch": 9.524567103191881, "grad_norm": 18.626474380493164, "learning_rate": 2.3821615566631426e-06, "loss": 2.2745, "step": 4550000 }, { "epoch": 9.525613758917507, "grad_norm": 26.166418075561523, "learning_rate": 2.3769173001728508e-06, "loss": 2.2646, "step": 4550500 }, { "epoch": 9.526660414643132, "grad_norm": 25.000951766967773, "learning_rate": 2.371673043682559e-06, "loss": 2.2866, "step": 4551000 }, { "epoch": 9.527707070368757, "grad_norm": 19.12917709350586, "learning_rate": 2.366428787192267e-06, "loss": 2.2694, "step": 4551500 }, { "epoch": 9.528753726094383, "grad_norm": 22.443452835083008, "learning_rate": 2.3611845307019755e-06, "loss": 2.2819, "step": 4552000 }, { "epoch": 9.529800381820008, "grad_norm": 20.338029861450195, "learning_rate": 2.3559402742116836e-06, "loss": 2.2726, "step": 4552500 }, { "epoch": 9.530847037545634, "grad_norm": 23.93083381652832, "learning_rate": 2.3506960177213913e-06, "loss": 2.2792, "step": 4553000 }, { "epoch": 9.53189369327126, "grad_norm": 17.530248641967773, "learning_rate": 2.3454517612311e-06, "loss": 2.264, "step": 4553500 }, { "epoch": 9.532940348996885, "grad_norm": 20.878511428833008, "learning_rate": 2.340207504740808e-06, "loss": 2.2716, "step": 4554000 }, { "epoch": 9.53398700472251, "grad_norm": 18.791120529174805, "learning_rate": 2.334963248250516e-06, "loss": 2.2763, "step": 4554500 }, { "epoch": 9.535033660448136, "grad_norm": 18.84556007385254, "learning_rate": 2.3297189917602246e-06, "loss": 2.2664, "step": 4555000 }, { "epoch": 9.536080316173761, "grad_norm": 21.630401611328125, "learning_rate": 2.3244747352699323e-06, "loss": 2.2809, "step": 4555500 }, { "epoch": 9.537126971899387, "grad_norm": 20.054380416870117, "learning_rate": 2.3192304787796404e-06, "loss": 2.2683, "step": 4556000 }, { "epoch": 9.538173627625012, "grad_norm": 19.25931167602539, "learning_rate": 2.3139862222893486e-06, "loss": 2.2756, "step": 4556500 }, { "epoch": 9.539220283350637, "grad_norm": 16.8867244720459, "learning_rate": 2.308741965799057e-06, "loss": 2.2773, "step": 4557000 }, { "epoch": 9.540266939076263, "grad_norm": 20.308895111083984, "learning_rate": 2.3034977093087652e-06, "loss": 2.2756, "step": 4557500 }, { "epoch": 9.541313594801888, "grad_norm": 20.178659439086914, "learning_rate": 2.2982534528184733e-06, "loss": 2.288, "step": 4558000 }, { "epoch": 9.542360250527514, "grad_norm": 22.818222045898438, "learning_rate": 2.2930091963281814e-06, "loss": 2.2615, "step": 4558500 }, { "epoch": 9.54340690625314, "grad_norm": 16.208757400512695, "learning_rate": 2.2877649398378896e-06, "loss": 2.2562, "step": 4559000 }, { "epoch": 9.544453561978765, "grad_norm": 17.418994903564453, "learning_rate": 2.2825206833475977e-06, "loss": 2.2769, "step": 4559500 }, { "epoch": 9.54550021770439, "grad_norm": 20.641773223876953, "learning_rate": 2.2772764268573062e-06, "loss": 2.2825, "step": 4560000 }, { "epoch": 9.546546873430016, "grad_norm": 25.362422943115234, "learning_rate": 2.2720321703670143e-06, "loss": 2.2675, "step": 4560500 }, { "epoch": 9.547593529155641, "grad_norm": 21.00766944885254, "learning_rate": 2.2667879138767224e-06, "loss": 2.2708, "step": 4561000 }, { "epoch": 9.548640184881268, "grad_norm": 18.293546676635742, "learning_rate": 2.2615436573864306e-06, "loss": 2.2787, "step": 4561500 }, { "epoch": 9.549686840606892, "grad_norm": 20.202043533325195, "learning_rate": 2.2562994008961387e-06, "loss": 2.2478, "step": 4562000 }, { "epoch": 9.55073349633252, "grad_norm": 21.155582427978516, "learning_rate": 2.2510551444058468e-06, "loss": 2.2684, "step": 4562500 }, { "epoch": 9.551780152058143, "grad_norm": 18.677705764770508, "learning_rate": 2.245810887915555e-06, "loss": 2.2728, "step": 4563000 }, { "epoch": 9.55282680778377, "grad_norm": 31.61868667602539, "learning_rate": 2.2405666314252634e-06, "loss": 2.2791, "step": 4563500 }, { "epoch": 9.553873463509396, "grad_norm": 21.19964027404785, "learning_rate": 2.2353223749349716e-06, "loss": 2.2666, "step": 4564000 }, { "epoch": 9.554920119235021, "grad_norm": 18.420812606811523, "learning_rate": 2.2300781184446792e-06, "loss": 2.2741, "step": 4564500 }, { "epoch": 9.555966774960647, "grad_norm": 25.12285804748535, "learning_rate": 2.2248338619543874e-06, "loss": 2.2732, "step": 4565000 }, { "epoch": 9.557013430686272, "grad_norm": 21.66007423400879, "learning_rate": 2.219589605464096e-06, "loss": 2.2732, "step": 4565500 }, { "epoch": 9.558060086411897, "grad_norm": 21.45775032043457, "learning_rate": 2.214345348973804e-06, "loss": 2.2804, "step": 4566000 }, { "epoch": 9.559106742137523, "grad_norm": 20.305923461914062, "learning_rate": 2.209101092483512e-06, "loss": 2.2688, "step": 4566500 }, { "epoch": 9.560153397863148, "grad_norm": 19.19967269897461, "learning_rate": 2.2038568359932207e-06, "loss": 2.2631, "step": 4567000 }, { "epoch": 9.561200053588774, "grad_norm": 20.473934173583984, "learning_rate": 2.1986125795029284e-06, "loss": 2.2786, "step": 4567500 }, { "epoch": 9.5622467093144, "grad_norm": 19.875947952270508, "learning_rate": 2.1933683230126365e-06, "loss": 2.2635, "step": 4568000 }, { "epoch": 9.563293365040025, "grad_norm": 24.62201499938965, "learning_rate": 2.188124066522345e-06, "loss": 2.2651, "step": 4568500 }, { "epoch": 9.56434002076565, "grad_norm": 20.633623123168945, "learning_rate": 2.182879810032053e-06, "loss": 2.2764, "step": 4569000 }, { "epoch": 9.565386676491276, "grad_norm": 20.71738624572754, "learning_rate": 2.1776355535417612e-06, "loss": 2.2626, "step": 4569500 }, { "epoch": 9.566433332216901, "grad_norm": 20.837997436523438, "learning_rate": 2.1723912970514694e-06, "loss": 2.2605, "step": 4570000 }, { "epoch": 9.567479987942527, "grad_norm": 20.824207305908203, "learning_rate": 2.1671470405611775e-06, "loss": 2.278, "step": 4570500 }, { "epoch": 9.568526643668152, "grad_norm": 18.94120216369629, "learning_rate": 2.1619027840708856e-06, "loss": 2.257, "step": 4571000 }, { "epoch": 9.569573299393777, "grad_norm": 19.5897274017334, "learning_rate": 2.1566585275805937e-06, "loss": 2.2875, "step": 4571500 }, { "epoch": 9.570619955119403, "grad_norm": 17.992218017578125, "learning_rate": 2.1514142710903022e-06, "loss": 2.2712, "step": 4572000 }, { "epoch": 9.571666610845028, "grad_norm": 21.84581756591797, "learning_rate": 2.1461700146000104e-06, "loss": 2.2642, "step": 4572500 }, { "epoch": 9.572713266570654, "grad_norm": 21.36202049255371, "learning_rate": 2.1409257581097185e-06, "loss": 2.2762, "step": 4573000 }, { "epoch": 9.57375992229628, "grad_norm": 19.965431213378906, "learning_rate": 2.135681501619426e-06, "loss": 2.2458, "step": 4573500 }, { "epoch": 9.574806578021905, "grad_norm": 18.541173934936523, "learning_rate": 2.1304372451291347e-06, "loss": 2.2584, "step": 4574000 }, { "epoch": 9.57585323374753, "grad_norm": 19.824636459350586, "learning_rate": 2.125192988638843e-06, "loss": 2.2722, "step": 4574500 }, { "epoch": 9.576899889473156, "grad_norm": 23.398815155029297, "learning_rate": 2.119948732148551e-06, "loss": 2.3117, "step": 4575000 }, { "epoch": 9.577946545198781, "grad_norm": 20.232593536376953, "learning_rate": 2.1147044756582595e-06, "loss": 2.285, "step": 4575500 }, { "epoch": 9.578993200924407, "grad_norm": 20.69916534423828, "learning_rate": 2.109460219167967e-06, "loss": 2.2766, "step": 4576000 }, { "epoch": 9.580039856650032, "grad_norm": 17.36343765258789, "learning_rate": 2.1042159626776753e-06, "loss": 2.2755, "step": 4576500 }, { "epoch": 9.581086512375657, "grad_norm": 21.261146545410156, "learning_rate": 2.098971706187384e-06, "loss": 2.2581, "step": 4577000 }, { "epoch": 9.582133168101283, "grad_norm": 17.16051483154297, "learning_rate": 2.093727449697092e-06, "loss": 2.2584, "step": 4577500 }, { "epoch": 9.583179823826908, "grad_norm": 21.878416061401367, "learning_rate": 2.0884831932068e-06, "loss": 2.2822, "step": 4578000 }, { "epoch": 9.584226479552534, "grad_norm": 23.767728805541992, "learning_rate": 2.083238936716508e-06, "loss": 2.2517, "step": 4578500 }, { "epoch": 9.58527313527816, "grad_norm": 23.626468658447266, "learning_rate": 2.0779946802262163e-06, "loss": 2.2824, "step": 4579000 }, { "epoch": 9.586319791003785, "grad_norm": 17.099878311157227, "learning_rate": 2.0727504237359244e-06, "loss": 2.2581, "step": 4579500 }, { "epoch": 9.58736644672941, "grad_norm": 17.2652587890625, "learning_rate": 2.0675061672456325e-06, "loss": 2.2823, "step": 4580000 }, { "epoch": 9.588413102455036, "grad_norm": 17.410804748535156, "learning_rate": 2.062261910755341e-06, "loss": 2.2671, "step": 4580500 }, { "epoch": 9.589459758180661, "grad_norm": 18.105899810791016, "learning_rate": 2.057017654265049e-06, "loss": 2.2697, "step": 4581000 }, { "epoch": 9.590506413906287, "grad_norm": 19.031164169311523, "learning_rate": 2.0517733977747573e-06, "loss": 2.2784, "step": 4581500 }, { "epoch": 9.591553069631912, "grad_norm": 17.636388778686523, "learning_rate": 2.0465291412844654e-06, "loss": 2.2775, "step": 4582000 }, { "epoch": 9.592599725357537, "grad_norm": 25.37812614440918, "learning_rate": 2.0412848847941735e-06, "loss": 2.2694, "step": 4582500 }, { "epoch": 9.593646381083163, "grad_norm": 17.73613929748535, "learning_rate": 2.0360406283038816e-06, "loss": 2.2776, "step": 4583000 }, { "epoch": 9.594693036808788, "grad_norm": 21.319881439208984, "learning_rate": 2.0307963718135897e-06, "loss": 2.2637, "step": 4583500 }, { "epoch": 9.595739692534414, "grad_norm": 23.69371795654297, "learning_rate": 2.0255521153232983e-06, "loss": 2.2732, "step": 4584000 }, { "epoch": 9.59678634826004, "grad_norm": 39.24587631225586, "learning_rate": 2.0203078588330064e-06, "loss": 2.2795, "step": 4584500 }, { "epoch": 9.597833003985665, "grad_norm": 18.807220458984375, "learning_rate": 2.015063602342714e-06, "loss": 2.2687, "step": 4585000 }, { "epoch": 9.59887965971129, "grad_norm": 18.252790451049805, "learning_rate": 2.0098193458524226e-06, "loss": 2.2634, "step": 4585500 }, { "epoch": 9.599926315436916, "grad_norm": 19.83814239501953, "learning_rate": 2.0045750893621307e-06, "loss": 2.2757, "step": 4586000 }, { "epoch": 9.600972971162541, "grad_norm": 22.672771453857422, "learning_rate": 1.999330832871839e-06, "loss": 2.2914, "step": 4586500 }, { "epoch": 9.602019626888167, "grad_norm": 19.88022232055664, "learning_rate": 1.994086576381547e-06, "loss": 2.2814, "step": 4587000 }, { "epoch": 9.603066282613792, "grad_norm": 16.217487335205078, "learning_rate": 1.9888423198912555e-06, "loss": 2.2665, "step": 4587500 }, { "epoch": 9.604112938339417, "grad_norm": 19.228864669799805, "learning_rate": 1.983598063400963e-06, "loss": 2.2729, "step": 4588000 }, { "epoch": 9.605159594065043, "grad_norm": 20.09279441833496, "learning_rate": 1.9783538069106713e-06, "loss": 2.279, "step": 4588500 }, { "epoch": 9.606206249790668, "grad_norm": 19.322021484375, "learning_rate": 1.97310955042038e-06, "loss": 2.2726, "step": 4589000 }, { "epoch": 9.607252905516294, "grad_norm": 20.446001052856445, "learning_rate": 1.967865293930088e-06, "loss": 2.2678, "step": 4589500 }, { "epoch": 9.60829956124192, "grad_norm": 21.368345260620117, "learning_rate": 1.962621037439796e-06, "loss": 2.2748, "step": 4590000 }, { "epoch": 9.609346216967545, "grad_norm": 21.559879302978516, "learning_rate": 1.957376780949504e-06, "loss": 2.2646, "step": 4590500 }, { "epoch": 9.61039287269317, "grad_norm": 19.00459098815918, "learning_rate": 1.9521325244592123e-06, "loss": 2.2725, "step": 4591000 }, { "epoch": 9.611439528418796, "grad_norm": 17.96255874633789, "learning_rate": 1.9468882679689204e-06, "loss": 2.2676, "step": 4591500 }, { "epoch": 9.612486184144421, "grad_norm": 21.355541229248047, "learning_rate": 1.9416440114786285e-06, "loss": 2.261, "step": 4592000 }, { "epoch": 9.613532839870047, "grad_norm": 20.047399520874023, "learning_rate": 1.936399754988337e-06, "loss": 2.288, "step": 4592500 }, { "epoch": 9.614579495595672, "grad_norm": 20.780712127685547, "learning_rate": 1.931155498498045e-06, "loss": 2.2664, "step": 4593000 }, { "epoch": 9.615626151321297, "grad_norm": 18.950786590576172, "learning_rate": 1.9259112420077533e-06, "loss": 2.2585, "step": 4593500 }, { "epoch": 9.616672807046923, "grad_norm": 19.839412689208984, "learning_rate": 1.9206669855174614e-06, "loss": 2.2659, "step": 4594000 }, { "epoch": 9.617719462772548, "grad_norm": 21.39126968383789, "learning_rate": 1.9154227290271695e-06, "loss": 2.2635, "step": 4594500 }, { "epoch": 9.618766118498174, "grad_norm": 21.271644592285156, "learning_rate": 1.9101784725368776e-06, "loss": 2.2588, "step": 4595000 }, { "epoch": 9.6198127742238, "grad_norm": 18.098072052001953, "learning_rate": 1.9049342160465857e-06, "loss": 2.2664, "step": 4595500 }, { "epoch": 9.620859429949425, "grad_norm": 20.880420684814453, "learning_rate": 1.899689959556294e-06, "loss": 2.261, "step": 4596000 }, { "epoch": 9.62190608567505, "grad_norm": 21.843069076538086, "learning_rate": 1.8944457030660022e-06, "loss": 2.2588, "step": 4596500 }, { "epoch": 9.622952741400677, "grad_norm": 23.08814811706543, "learning_rate": 1.8892014465757103e-06, "loss": 2.2745, "step": 4597000 }, { "epoch": 9.623999397126301, "grad_norm": 21.010473251342773, "learning_rate": 1.8839571900854186e-06, "loss": 2.2664, "step": 4597500 }, { "epoch": 9.625046052851928, "grad_norm": 18.924510955810547, "learning_rate": 1.8787129335951267e-06, "loss": 2.257, "step": 4598000 }, { "epoch": 9.626092708577552, "grad_norm": 18.528812408447266, "learning_rate": 1.8734686771048349e-06, "loss": 2.2654, "step": 4598500 }, { "epoch": 9.62713936430318, "grad_norm": 20.45435905456543, "learning_rate": 1.8682244206145432e-06, "loss": 2.2762, "step": 4599000 }, { "epoch": 9.628186020028805, "grad_norm": 19.988840103149414, "learning_rate": 1.8629801641242513e-06, "loss": 2.2765, "step": 4599500 }, { "epoch": 9.62923267575443, "grad_norm": 26.283950805664062, "learning_rate": 1.8577359076339594e-06, "loss": 2.2692, "step": 4600000 }, { "epoch": 9.630279331480056, "grad_norm": 15.674524307250977, "learning_rate": 1.8524916511436673e-06, "loss": 2.2623, "step": 4600500 }, { "epoch": 9.631325987205681, "grad_norm": 18.648822784423828, "learning_rate": 1.8472473946533759e-06, "loss": 2.2627, "step": 4601000 }, { "epoch": 9.632372642931307, "grad_norm": 20.938621520996094, "learning_rate": 1.842003138163084e-06, "loss": 2.258, "step": 4601500 }, { "epoch": 9.633419298656932, "grad_norm": 21.61751365661621, "learning_rate": 1.8367588816727919e-06, "loss": 2.271, "step": 4602000 }, { "epoch": 9.634465954382557, "grad_norm": 19.55546760559082, "learning_rate": 1.8315146251825004e-06, "loss": 2.2748, "step": 4602500 }, { "epoch": 9.635512610108183, "grad_norm": 18.9344482421875, "learning_rate": 1.8262703686922085e-06, "loss": 2.2604, "step": 4603000 }, { "epoch": 9.636559265833808, "grad_norm": 20.85150718688965, "learning_rate": 1.8210261122019164e-06, "loss": 2.2723, "step": 4603500 }, { "epoch": 9.637605921559434, "grad_norm": 19.157245635986328, "learning_rate": 1.8157818557116245e-06, "loss": 2.2745, "step": 4604000 }, { "epoch": 9.63865257728506, "grad_norm": 21.30376625061035, "learning_rate": 1.8105375992213329e-06, "loss": 2.2702, "step": 4604500 }, { "epoch": 9.639699233010685, "grad_norm": 18.351104736328125, "learning_rate": 1.805293342731041e-06, "loss": 2.2464, "step": 4605000 }, { "epoch": 9.64074588873631, "grad_norm": 18.284879684448242, "learning_rate": 1.800049086240749e-06, "loss": 2.2879, "step": 4605500 }, { "epoch": 9.641792544461936, "grad_norm": 23.663129806518555, "learning_rate": 1.7948048297504574e-06, "loss": 2.2584, "step": 4606000 }, { "epoch": 9.642839200187561, "grad_norm": 18.101530075073242, "learning_rate": 1.7895605732601655e-06, "loss": 2.2636, "step": 4606500 }, { "epoch": 9.643885855913187, "grad_norm": 20.80944061279297, "learning_rate": 1.7843163167698737e-06, "loss": 2.271, "step": 4607000 }, { "epoch": 9.644932511638812, "grad_norm": 18.505475997924805, "learning_rate": 1.779072060279582e-06, "loss": 2.2699, "step": 4607500 }, { "epoch": 9.645979167364438, "grad_norm": 20.36873435974121, "learning_rate": 1.77382780378929e-06, "loss": 2.2787, "step": 4608000 }, { "epoch": 9.647025823090063, "grad_norm": 20.514877319335938, "learning_rate": 1.7685835472989982e-06, "loss": 2.2599, "step": 4608500 }, { "epoch": 9.648072478815688, "grad_norm": 21.289306640625, "learning_rate": 1.7633392908087063e-06, "loss": 2.2722, "step": 4609000 }, { "epoch": 9.649119134541314, "grad_norm": 16.772306442260742, "learning_rate": 1.7580950343184147e-06, "loss": 2.2616, "step": 4609500 }, { "epoch": 9.65016579026694, "grad_norm": 19.946918487548828, "learning_rate": 1.7528507778281228e-06, "loss": 2.2728, "step": 4610000 }, { "epoch": 9.651212445992565, "grad_norm": 20.50033950805664, "learning_rate": 1.7476065213378309e-06, "loss": 2.2658, "step": 4610500 }, { "epoch": 9.65225910171819, "grad_norm": 26.15624237060547, "learning_rate": 1.7423622648475392e-06, "loss": 2.258, "step": 4611000 }, { "epoch": 9.653305757443816, "grad_norm": 18.639184951782227, "learning_rate": 1.7371180083572473e-06, "loss": 2.2718, "step": 4611500 }, { "epoch": 9.654352413169441, "grad_norm": 19.5877742767334, "learning_rate": 1.7318737518669552e-06, "loss": 2.273, "step": 4612000 }, { "epoch": 9.655399068895067, "grad_norm": 23.636476516723633, "learning_rate": 1.7266294953766633e-06, "loss": 2.2527, "step": 4612500 }, { "epoch": 9.656445724620692, "grad_norm": 21.6992130279541, "learning_rate": 1.7213852388863719e-06, "loss": 2.2657, "step": 4613000 }, { "epoch": 9.657492380346318, "grad_norm": 19.750499725341797, "learning_rate": 1.7161409823960798e-06, "loss": 2.2749, "step": 4613500 }, { "epoch": 9.658539036071943, "grad_norm": 21.574369430541992, "learning_rate": 1.710896725905788e-06, "loss": 2.244, "step": 4614000 }, { "epoch": 9.659585691797568, "grad_norm": 21.444528579711914, "learning_rate": 1.7056524694154964e-06, "loss": 2.2663, "step": 4614500 }, { "epoch": 9.660632347523194, "grad_norm": 17.073179244995117, "learning_rate": 1.7004082129252043e-06, "loss": 2.2717, "step": 4615000 }, { "epoch": 9.66167900324882, "grad_norm": 18.753917694091797, "learning_rate": 1.6951639564349125e-06, "loss": 2.2746, "step": 4615500 }, { "epoch": 9.662725658974445, "grad_norm": 24.06033706665039, "learning_rate": 1.689919699944621e-06, "loss": 2.2652, "step": 4616000 }, { "epoch": 9.66377231470007, "grad_norm": 18.839231491088867, "learning_rate": 1.6846754434543289e-06, "loss": 2.2674, "step": 4616500 }, { "epoch": 9.664818970425696, "grad_norm": 24.845252990722656, "learning_rate": 1.679431186964037e-06, "loss": 2.2772, "step": 4617000 }, { "epoch": 9.665865626151321, "grad_norm": 20.438278198242188, "learning_rate": 1.6741869304737451e-06, "loss": 2.2522, "step": 4617500 }, { "epoch": 9.666912281876947, "grad_norm": 22.34147834777832, "learning_rate": 1.6689426739834534e-06, "loss": 2.2724, "step": 4618000 }, { "epoch": 9.667958937602572, "grad_norm": 19.096899032592773, "learning_rate": 1.6636984174931616e-06, "loss": 2.2601, "step": 4618500 }, { "epoch": 9.669005593328198, "grad_norm": 19.129587173461914, "learning_rate": 1.6584541610028697e-06, "loss": 2.2857, "step": 4619000 }, { "epoch": 9.670052249053823, "grad_norm": 20.630773544311523, "learning_rate": 1.653209904512578e-06, "loss": 2.2557, "step": 4619500 }, { "epoch": 9.671098904779448, "grad_norm": 15.782438278198242, "learning_rate": 1.6479656480222861e-06, "loss": 2.275, "step": 4620000 }, { "epoch": 9.672145560505074, "grad_norm": 17.64104652404785, "learning_rate": 1.6427213915319942e-06, "loss": 2.2788, "step": 4620500 }, { "epoch": 9.6731922162307, "grad_norm": 21.22057342529297, "learning_rate": 1.6374771350417021e-06, "loss": 2.2615, "step": 4621000 }, { "epoch": 9.674238871956325, "grad_norm": 17.96949577331543, "learning_rate": 1.6322328785514107e-06, "loss": 2.2602, "step": 4621500 }, { "epoch": 9.67528552768195, "grad_norm": 22.7465877532959, "learning_rate": 1.6269886220611188e-06, "loss": 2.2647, "step": 4622000 }, { "epoch": 9.676332183407576, "grad_norm": 19.95114517211914, "learning_rate": 1.6217443655708267e-06, "loss": 2.28, "step": 4622500 }, { "epoch": 9.677378839133201, "grad_norm": 21.23027229309082, "learning_rate": 1.6165001090805352e-06, "loss": 2.2803, "step": 4623000 }, { "epoch": 9.678425494858827, "grad_norm": 18.576265335083008, "learning_rate": 1.6112558525902433e-06, "loss": 2.253, "step": 4623500 }, { "epoch": 9.679472150584452, "grad_norm": 27.218311309814453, "learning_rate": 1.6060115960999512e-06, "loss": 2.2622, "step": 4624000 }, { "epoch": 9.680518806310078, "grad_norm": 22.246843338012695, "learning_rate": 1.6007673396096598e-06, "loss": 2.2599, "step": 4624500 }, { "epoch": 9.681565462035703, "grad_norm": 21.379169464111328, "learning_rate": 1.5955230831193677e-06, "loss": 2.2681, "step": 4625000 }, { "epoch": 9.682612117761328, "grad_norm": 17.537837982177734, "learning_rate": 1.5902788266290758e-06, "loss": 2.2766, "step": 4625500 }, { "epoch": 9.683658773486954, "grad_norm": 23.090381622314453, "learning_rate": 1.585034570138784e-06, "loss": 2.2739, "step": 4626000 }, { "epoch": 9.68470542921258, "grad_norm": 24.438875198364258, "learning_rate": 1.5797903136484922e-06, "loss": 2.2555, "step": 4626500 }, { "epoch": 9.685752084938205, "grad_norm": 18.101699829101562, "learning_rate": 1.5745460571582004e-06, "loss": 2.2719, "step": 4627000 }, { "epoch": 9.68679874066383, "grad_norm": 21.367053985595703, "learning_rate": 1.5693018006679085e-06, "loss": 2.251, "step": 4627500 }, { "epoch": 9.687845396389456, "grad_norm": 19.421667098999023, "learning_rate": 1.5640575441776168e-06, "loss": 2.2568, "step": 4628000 }, { "epoch": 9.688892052115081, "grad_norm": 18.4425106048584, "learning_rate": 1.558813287687325e-06, "loss": 2.2833, "step": 4628500 }, { "epoch": 9.689938707840707, "grad_norm": 19.38677215576172, "learning_rate": 1.553569031197033e-06, "loss": 2.2431, "step": 4629000 }, { "epoch": 9.690985363566332, "grad_norm": 21.877134323120117, "learning_rate": 1.5483247747067411e-06, "loss": 2.2647, "step": 4629500 }, { "epoch": 9.692032019291958, "grad_norm": 24.561725616455078, "learning_rate": 1.5430805182164495e-06, "loss": 2.2732, "step": 4630000 }, { "epoch": 9.693078675017583, "grad_norm": 21.281295776367188, "learning_rate": 1.5378362617261576e-06, "loss": 2.2489, "step": 4630500 }, { "epoch": 9.694125330743208, "grad_norm": 19.040437698364258, "learning_rate": 1.5325920052358657e-06, "loss": 2.2756, "step": 4631000 }, { "epoch": 9.695171986468836, "grad_norm": 19.153160095214844, "learning_rate": 1.5273477487455738e-06, "loss": 2.2671, "step": 4631500 }, { "epoch": 9.69621864219446, "grad_norm": 31.139850616455078, "learning_rate": 1.5221034922552821e-06, "loss": 2.2383, "step": 4632000 }, { "epoch": 9.697265297920087, "grad_norm": 19.704050064086914, "learning_rate": 1.5168592357649903e-06, "loss": 2.2976, "step": 4632500 }, { "epoch": 9.69831195364571, "grad_norm": 19.341081619262695, "learning_rate": 1.5116149792746984e-06, "loss": 2.2442, "step": 4633000 }, { "epoch": 9.699358609371338, "grad_norm": 19.056734085083008, "learning_rate": 1.5063707227844067e-06, "loss": 2.2732, "step": 4633500 }, { "epoch": 9.700405265096963, "grad_norm": 19.658971786499023, "learning_rate": 1.5011264662941146e-06, "loss": 2.2501, "step": 4634000 }, { "epoch": 9.701451920822588, "grad_norm": 18.57846450805664, "learning_rate": 1.495882209803823e-06, "loss": 2.2427, "step": 4634500 }, { "epoch": 9.702498576548214, "grad_norm": 21.34816551208496, "learning_rate": 1.4906379533135313e-06, "loss": 2.2675, "step": 4635000 }, { "epoch": 9.70354523227384, "grad_norm": 20.195999145507812, "learning_rate": 1.4853936968232392e-06, "loss": 2.2771, "step": 4635500 }, { "epoch": 9.704591887999465, "grad_norm": 20.563365936279297, "learning_rate": 1.4801494403329475e-06, "loss": 2.2617, "step": 4636000 }, { "epoch": 9.70563854372509, "grad_norm": 16.806808471679688, "learning_rate": 1.4749051838426556e-06, "loss": 2.2628, "step": 4636500 }, { "epoch": 9.706685199450716, "grad_norm": 20.155080795288086, "learning_rate": 1.4696609273523637e-06, "loss": 2.2425, "step": 4637000 }, { "epoch": 9.707731855176341, "grad_norm": 19.41500473022461, "learning_rate": 1.4644166708620718e-06, "loss": 2.2576, "step": 4637500 }, { "epoch": 9.708778510901967, "grad_norm": 19.011343002319336, "learning_rate": 1.4591724143717802e-06, "loss": 2.2495, "step": 4638000 }, { "epoch": 9.709825166627592, "grad_norm": 20.049943923950195, "learning_rate": 1.4539281578814883e-06, "loss": 2.261, "step": 4638500 }, { "epoch": 9.710871822353218, "grad_norm": 24.377567291259766, "learning_rate": 1.4486839013911964e-06, "loss": 2.2596, "step": 4639000 }, { "epoch": 9.711918478078843, "grad_norm": 19.364458084106445, "learning_rate": 1.4434396449009047e-06, "loss": 2.2718, "step": 4639500 }, { "epoch": 9.712965133804468, "grad_norm": 20.513986587524414, "learning_rate": 1.4381953884106126e-06, "loss": 2.2765, "step": 4640000 }, { "epoch": 9.714011789530094, "grad_norm": 21.16796875, "learning_rate": 1.432951131920321e-06, "loss": 2.2735, "step": 4640500 }, { "epoch": 9.71505844525572, "grad_norm": 18.122037887573242, "learning_rate": 1.4277068754300293e-06, "loss": 2.2611, "step": 4641000 }, { "epoch": 9.716105100981345, "grad_norm": 20.214448928833008, "learning_rate": 1.4224626189397372e-06, "loss": 2.27, "step": 4641500 }, { "epoch": 9.71715175670697, "grad_norm": 19.869873046875, "learning_rate": 1.4172183624494455e-06, "loss": 2.2625, "step": 4642000 }, { "epoch": 9.718198412432596, "grad_norm": 19.527048110961914, "learning_rate": 1.4119741059591536e-06, "loss": 2.2802, "step": 4642500 }, { "epoch": 9.719245068158221, "grad_norm": 22.290584564208984, "learning_rate": 1.4067298494688617e-06, "loss": 2.2634, "step": 4643000 }, { "epoch": 9.720291723883847, "grad_norm": 17.919496536254883, "learning_rate": 1.40148559297857e-06, "loss": 2.2499, "step": 4643500 }, { "epoch": 9.721338379609472, "grad_norm": 16.69318199157715, "learning_rate": 1.3962413364882782e-06, "loss": 2.2681, "step": 4644000 }, { "epoch": 9.722385035335098, "grad_norm": 15.61162281036377, "learning_rate": 1.3909970799979863e-06, "loss": 2.2592, "step": 4644500 }, { "epoch": 9.723431691060723, "grad_norm": 18.698814392089844, "learning_rate": 1.3857528235076944e-06, "loss": 2.2657, "step": 4645000 }, { "epoch": 9.724478346786348, "grad_norm": 22.830522537231445, "learning_rate": 1.3805085670174027e-06, "loss": 2.2613, "step": 4645500 }, { "epoch": 9.725525002511974, "grad_norm": 18.554994583129883, "learning_rate": 1.3752643105271106e-06, "loss": 2.2698, "step": 4646000 }, { "epoch": 9.7265716582376, "grad_norm": 20.488677978515625, "learning_rate": 1.370020054036819e-06, "loss": 2.2487, "step": 4646500 }, { "epoch": 9.727618313963225, "grad_norm": 18.410388946533203, "learning_rate": 1.364775797546527e-06, "loss": 2.2686, "step": 4647000 }, { "epoch": 9.72866496968885, "grad_norm": 18.469032287597656, "learning_rate": 1.3595315410562352e-06, "loss": 2.2688, "step": 4647500 }, { "epoch": 9.729711625414476, "grad_norm": 19.0128116607666, "learning_rate": 1.3542872845659435e-06, "loss": 2.2584, "step": 4648000 }, { "epoch": 9.730758281140101, "grad_norm": 19.311214447021484, "learning_rate": 1.3490430280756516e-06, "loss": 2.2721, "step": 4648500 }, { "epoch": 9.731804936865727, "grad_norm": 17.182109832763672, "learning_rate": 1.3437987715853597e-06, "loss": 2.2544, "step": 4649000 }, { "epoch": 9.732851592591352, "grad_norm": 19.178194046020508, "learning_rate": 1.338554515095068e-06, "loss": 2.2523, "step": 4649500 }, { "epoch": 9.733898248316978, "grad_norm": 16.01047134399414, "learning_rate": 1.3333102586047762e-06, "loss": 2.2734, "step": 4650000 }, { "epoch": 9.734944904042603, "grad_norm": 18.141687393188477, "learning_rate": 1.3280660021144843e-06, "loss": 2.2813, "step": 4650500 }, { "epoch": 9.735991559768228, "grad_norm": 21.396848678588867, "learning_rate": 1.3228217456241924e-06, "loss": 2.2737, "step": 4651000 }, { "epoch": 9.737038215493854, "grad_norm": 17.555252075195312, "learning_rate": 1.3175774891339005e-06, "loss": 2.2661, "step": 4651500 }, { "epoch": 9.73808487121948, "grad_norm": 19.597957611083984, "learning_rate": 1.3123332326436089e-06, "loss": 2.2632, "step": 4652000 }, { "epoch": 9.739131526945105, "grad_norm": 20.689817428588867, "learning_rate": 1.307088976153317e-06, "loss": 2.2619, "step": 4652500 }, { "epoch": 9.74017818267073, "grad_norm": 17.8238525390625, "learning_rate": 1.301844719663025e-06, "loss": 2.2605, "step": 4653000 }, { "epoch": 9.741224838396356, "grad_norm": 23.339635848999023, "learning_rate": 1.2966004631727332e-06, "loss": 2.2557, "step": 4653500 }, { "epoch": 9.742271494121981, "grad_norm": 20.44103240966797, "learning_rate": 1.2913562066824415e-06, "loss": 2.2938, "step": 4654000 }, { "epoch": 9.743318149847607, "grad_norm": 22.077123641967773, "learning_rate": 1.2861119501921496e-06, "loss": 2.2515, "step": 4654500 }, { "epoch": 9.744364805573232, "grad_norm": 18.98154067993164, "learning_rate": 1.2808676937018578e-06, "loss": 2.2666, "step": 4655000 }, { "epoch": 9.745411461298858, "grad_norm": 22.246822357177734, "learning_rate": 1.275623437211566e-06, "loss": 2.2633, "step": 4655500 }, { "epoch": 9.746458117024483, "grad_norm": 28.059127807617188, "learning_rate": 1.270379180721274e-06, "loss": 2.2674, "step": 4656000 }, { "epoch": 9.747504772750109, "grad_norm": 20.099864959716797, "learning_rate": 1.2651349242309823e-06, "loss": 2.2809, "step": 4656500 }, { "epoch": 9.748551428475734, "grad_norm": 22.678184509277344, "learning_rate": 1.2598906677406904e-06, "loss": 2.279, "step": 4657000 }, { "epoch": 9.74959808420136, "grad_norm": 20.10785484313965, "learning_rate": 1.2546464112503985e-06, "loss": 2.2541, "step": 4657500 }, { "epoch": 9.750644739926985, "grad_norm": 19.878751754760742, "learning_rate": 1.2494021547601069e-06, "loss": 2.2821, "step": 4658000 }, { "epoch": 9.75169139565261, "grad_norm": 23.992948532104492, "learning_rate": 1.244157898269815e-06, "loss": 2.2773, "step": 4658500 }, { "epoch": 9.752738051378236, "grad_norm": 19.51677131652832, "learning_rate": 1.238913641779523e-06, "loss": 2.2771, "step": 4659000 }, { "epoch": 9.753784707103861, "grad_norm": 21.441606521606445, "learning_rate": 1.2336693852892312e-06, "loss": 2.2429, "step": 4659500 }, { "epoch": 9.754831362829487, "grad_norm": 19.217138290405273, "learning_rate": 1.2284251287989395e-06, "loss": 2.2702, "step": 4660000 }, { "epoch": 9.755878018555112, "grad_norm": 26.455883026123047, "learning_rate": 1.2231808723086476e-06, "loss": 2.2621, "step": 4660500 }, { "epoch": 9.756924674280738, "grad_norm": 23.369915008544922, "learning_rate": 1.2179366158183558e-06, "loss": 2.2508, "step": 4661000 }, { "epoch": 9.757971330006363, "grad_norm": 22.671772003173828, "learning_rate": 1.212692359328064e-06, "loss": 2.2681, "step": 4661500 }, { "epoch": 9.759017985731989, "grad_norm": 19.700870513916016, "learning_rate": 1.207448102837772e-06, "loss": 2.2418, "step": 4662000 }, { "epoch": 9.760064641457614, "grad_norm": 22.379140853881836, "learning_rate": 1.2022038463474803e-06, "loss": 2.2688, "step": 4662500 }, { "epoch": 9.76111129718324, "grad_norm": 28.02945327758789, "learning_rate": 1.1969595898571886e-06, "loss": 2.2609, "step": 4663000 }, { "epoch": 9.762157952908865, "grad_norm": 17.991657257080078, "learning_rate": 1.1917153333668965e-06, "loss": 2.2436, "step": 4663500 }, { "epoch": 9.76320460863449, "grad_norm": 19.209760665893555, "learning_rate": 1.1864710768766049e-06, "loss": 2.2616, "step": 4664000 }, { "epoch": 9.764251264360116, "grad_norm": 20.215543746948242, "learning_rate": 1.181226820386313e-06, "loss": 2.2571, "step": 4664500 }, { "epoch": 9.765297920085741, "grad_norm": 22.035659790039062, "learning_rate": 1.175982563896021e-06, "loss": 2.2638, "step": 4665000 }, { "epoch": 9.766344575811367, "grad_norm": 19.512258529663086, "learning_rate": 1.1707383074057294e-06, "loss": 2.2487, "step": 4665500 }, { "epoch": 9.767391231536992, "grad_norm": 19.772171020507812, "learning_rate": 1.1654940509154375e-06, "loss": 2.2637, "step": 4666000 }, { "epoch": 9.768437887262618, "grad_norm": 18.739933013916016, "learning_rate": 1.1602497944251457e-06, "loss": 2.2535, "step": 4666500 }, { "epoch": 9.769484542988245, "grad_norm": 19.26297378540039, "learning_rate": 1.1550055379348538e-06, "loss": 2.2745, "step": 4667000 }, { "epoch": 9.770531198713869, "grad_norm": 26.22936248779297, "learning_rate": 1.1497612814445619e-06, "loss": 2.282, "step": 4667500 }, { "epoch": 9.771577854439496, "grad_norm": 19.254837036132812, "learning_rate": 1.14451702495427e-06, "loss": 2.2682, "step": 4668000 }, { "epoch": 9.772624510165121, "grad_norm": 17.498966217041016, "learning_rate": 1.1392727684639783e-06, "loss": 2.2503, "step": 4668500 }, { "epoch": 9.773671165890747, "grad_norm": 32.888206481933594, "learning_rate": 1.1340285119736864e-06, "loss": 2.2715, "step": 4669000 }, { "epoch": 9.774717821616372, "grad_norm": 24.409475326538086, "learning_rate": 1.1287842554833946e-06, "loss": 2.2535, "step": 4669500 }, { "epoch": 9.775764477341998, "grad_norm": 20.58638572692871, "learning_rate": 1.1235399989931029e-06, "loss": 2.2646, "step": 4670000 }, { "epoch": 9.776811133067623, "grad_norm": 26.828062057495117, "learning_rate": 1.118295742502811e-06, "loss": 2.2847, "step": 4670500 }, { "epoch": 9.777857788793249, "grad_norm": 18.350019454956055, "learning_rate": 1.1130514860125191e-06, "loss": 2.2564, "step": 4671000 }, { "epoch": 9.778904444518874, "grad_norm": 16.811246871948242, "learning_rate": 1.1078072295222274e-06, "loss": 2.2657, "step": 4671500 }, { "epoch": 9.7799511002445, "grad_norm": 21.81740951538086, "learning_rate": 1.1025629730319353e-06, "loss": 2.269, "step": 4672000 }, { "epoch": 9.780997755970125, "grad_norm": 28.763507843017578, "learning_rate": 1.0973187165416437e-06, "loss": 2.2706, "step": 4672500 }, { "epoch": 9.78204441169575, "grad_norm": 17.129709243774414, "learning_rate": 1.0920744600513518e-06, "loss": 2.244, "step": 4673000 }, { "epoch": 9.783091067421376, "grad_norm": 19.074687957763672, "learning_rate": 1.08683020356106e-06, "loss": 2.2745, "step": 4673500 }, { "epoch": 9.784137723147001, "grad_norm": 23.216638565063477, "learning_rate": 1.0815859470707682e-06, "loss": 2.269, "step": 4674000 }, { "epoch": 9.785184378872627, "grad_norm": 17.570180892944336, "learning_rate": 1.0763416905804763e-06, "loss": 2.2448, "step": 4674500 }, { "epoch": 9.786231034598252, "grad_norm": 22.45827293395996, "learning_rate": 1.0710974340901845e-06, "loss": 2.2771, "step": 4675000 }, { "epoch": 9.787277690323878, "grad_norm": 19.741100311279297, "learning_rate": 1.0658531775998926e-06, "loss": 2.2569, "step": 4675500 }, { "epoch": 9.788324346049503, "grad_norm": 21.468969345092773, "learning_rate": 1.060608921109601e-06, "loss": 2.2487, "step": 4676000 }, { "epoch": 9.789371001775129, "grad_norm": 17.941627502441406, "learning_rate": 1.0553646646193088e-06, "loss": 2.2446, "step": 4676500 }, { "epoch": 9.790417657500754, "grad_norm": 22.383468627929688, "learning_rate": 1.0501204081290171e-06, "loss": 2.2458, "step": 4677000 }, { "epoch": 9.79146431322638, "grad_norm": 18.973146438598633, "learning_rate": 1.0448761516387255e-06, "loss": 2.2624, "step": 4677500 }, { "epoch": 9.792510968952005, "grad_norm": 21.674222946166992, "learning_rate": 1.0396318951484334e-06, "loss": 2.2615, "step": 4678000 }, { "epoch": 9.79355762467763, "grad_norm": 23.127756118774414, "learning_rate": 1.0343876386581417e-06, "loss": 2.2712, "step": 4678500 }, { "epoch": 9.794604280403256, "grad_norm": 18.759328842163086, "learning_rate": 1.0291433821678498e-06, "loss": 2.2658, "step": 4679000 }, { "epoch": 9.795650936128881, "grad_norm": 18.244617462158203, "learning_rate": 1.023899125677558e-06, "loss": 2.2761, "step": 4679500 }, { "epoch": 9.796697591854507, "grad_norm": 19.25420379638672, "learning_rate": 1.0186548691872662e-06, "loss": 2.262, "step": 4680000 }, { "epoch": 9.797744247580132, "grad_norm": 18.403242111206055, "learning_rate": 1.0134106126969744e-06, "loss": 2.279, "step": 4680500 }, { "epoch": 9.798790903305758, "grad_norm": 19.29412841796875, "learning_rate": 1.0081663562066825e-06, "loss": 2.2525, "step": 4681000 }, { "epoch": 9.799837559031383, "grad_norm": 21.153820037841797, "learning_rate": 1.0029220997163906e-06, "loss": 2.252, "step": 4681500 }, { "epoch": 9.800884214757009, "grad_norm": 20.39505958557129, "learning_rate": 9.97677843226099e-07, "loss": 2.2637, "step": 4682000 }, { "epoch": 9.801930870482634, "grad_norm": 21.452951431274414, "learning_rate": 9.92433586735807e-07, "loss": 2.2558, "step": 4682500 }, { "epoch": 9.80297752620826, "grad_norm": 20.39303207397461, "learning_rate": 9.871893302455151e-07, "loss": 2.2585, "step": 4683000 }, { "epoch": 9.804024181933885, "grad_norm": 18.960546493530273, "learning_rate": 9.819450737552235e-07, "loss": 2.2817, "step": 4683500 }, { "epoch": 9.80507083765951, "grad_norm": 17.21097755432129, "learning_rate": 9.767008172649314e-07, "loss": 2.2454, "step": 4684000 }, { "epoch": 9.806117493385136, "grad_norm": 18.51789665222168, "learning_rate": 9.714565607746397e-07, "loss": 2.2641, "step": 4684500 }, { "epoch": 9.807164149110761, "grad_norm": 18.883291244506836, "learning_rate": 9.662123042843478e-07, "loss": 2.2453, "step": 4685000 }, { "epoch": 9.808210804836387, "grad_norm": 18.765357971191406, "learning_rate": 9.60968047794056e-07, "loss": 2.2582, "step": 4685500 }, { "epoch": 9.809257460562012, "grad_norm": 21.343475341796875, "learning_rate": 9.557237913037643e-07, "loss": 2.2726, "step": 4686000 }, { "epoch": 9.810304116287638, "grad_norm": 20.576637268066406, "learning_rate": 9.504795348134723e-07, "loss": 2.2637, "step": 4686500 }, { "epoch": 9.811350772013263, "grad_norm": 23.676727294921875, "learning_rate": 9.452352783231805e-07, "loss": 2.2732, "step": 4687000 }, { "epoch": 9.812397427738889, "grad_norm": 20.66800308227539, "learning_rate": 9.399910218328886e-07, "loss": 2.27, "step": 4687500 }, { "epoch": 9.813444083464514, "grad_norm": 17.94548225402832, "learning_rate": 9.347467653425968e-07, "loss": 2.255, "step": 4688000 }, { "epoch": 9.81449073919014, "grad_norm": 19.073945999145508, "learning_rate": 9.29502508852305e-07, "loss": 2.2484, "step": 4688500 }, { "epoch": 9.815537394915765, "grad_norm": 18.851804733276367, "learning_rate": 9.242582523620132e-07, "loss": 2.2387, "step": 4689000 }, { "epoch": 9.81658405064139, "grad_norm": 18.209871292114258, "learning_rate": 9.190139958717214e-07, "loss": 2.2627, "step": 4689500 }, { "epoch": 9.817630706367016, "grad_norm": 25.138797760009766, "learning_rate": 9.137697393814294e-07, "loss": 2.2497, "step": 4690000 }, { "epoch": 9.818677362092641, "grad_norm": 23.020170211791992, "learning_rate": 9.085254828911377e-07, "loss": 2.2636, "step": 4690500 }, { "epoch": 9.819724017818267, "grad_norm": 24.956674575805664, "learning_rate": 9.032812264008459e-07, "loss": 2.2698, "step": 4691000 }, { "epoch": 9.820770673543892, "grad_norm": 19.42409896850586, "learning_rate": 8.980369699105539e-07, "loss": 2.2768, "step": 4691500 }, { "epoch": 9.821817329269518, "grad_norm": 19.584348678588867, "learning_rate": 8.927927134202622e-07, "loss": 2.2615, "step": 4692000 }, { "epoch": 9.822863984995143, "grad_norm": 19.155986785888672, "learning_rate": 8.875484569299703e-07, "loss": 2.2687, "step": 4692500 }, { "epoch": 9.823910640720769, "grad_norm": 18.847322463989258, "learning_rate": 8.823042004396785e-07, "loss": 2.2562, "step": 4693000 }, { "epoch": 9.824957296446394, "grad_norm": 17.108985900878906, "learning_rate": 8.770599439493867e-07, "loss": 2.2592, "step": 4693500 }, { "epoch": 9.82600395217202, "grad_norm": 18.99665641784668, "learning_rate": 8.718156874590948e-07, "loss": 2.2579, "step": 4694000 }, { "epoch": 9.827050607897645, "grad_norm": 46.5167121887207, "learning_rate": 8.66571430968803e-07, "loss": 2.2488, "step": 4694500 }, { "epoch": 9.82809726362327, "grad_norm": 21.663593292236328, "learning_rate": 8.613271744785112e-07, "loss": 2.2681, "step": 4695000 }, { "epoch": 9.829143919348896, "grad_norm": 20.66739273071289, "learning_rate": 8.560829179882194e-07, "loss": 2.2618, "step": 4695500 }, { "epoch": 9.830190575074521, "grad_norm": 18.719072341918945, "learning_rate": 8.508386614979276e-07, "loss": 2.2787, "step": 4696000 }, { "epoch": 9.831237230800147, "grad_norm": 23.17781639099121, "learning_rate": 8.455944050076356e-07, "loss": 2.2482, "step": 4696500 }, { "epoch": 9.832283886525772, "grad_norm": 21.487791061401367, "learning_rate": 8.403501485173439e-07, "loss": 2.2839, "step": 4697000 }, { "epoch": 9.833330542251398, "grad_norm": 20.032194137573242, "learning_rate": 8.35105892027052e-07, "loss": 2.2556, "step": 4697500 }, { "epoch": 9.834377197977023, "grad_norm": 19.92630958557129, "learning_rate": 8.298616355367602e-07, "loss": 2.2645, "step": 4698000 }, { "epoch": 9.835423853702649, "grad_norm": 20.64409637451172, "learning_rate": 8.246173790464683e-07, "loss": 2.2402, "step": 4698500 }, { "epoch": 9.836470509428274, "grad_norm": 20.665287017822266, "learning_rate": 8.193731225561765e-07, "loss": 2.2608, "step": 4699000 }, { "epoch": 9.8375171651539, "grad_norm": 28.996488571166992, "learning_rate": 8.141288660658847e-07, "loss": 2.2557, "step": 4699500 }, { "epoch": 9.838563820879525, "grad_norm": 17.857280731201172, "learning_rate": 8.088846095755928e-07, "loss": 2.2535, "step": 4700000 }, { "epoch": 9.83961047660515, "grad_norm": 21.191497802734375, "learning_rate": 8.036403530853011e-07, "loss": 2.2583, "step": 4700500 }, { "epoch": 9.840657132330776, "grad_norm": 24.469392776489258, "learning_rate": 7.983960965950091e-07, "loss": 2.2704, "step": 4701000 }, { "epoch": 9.841703788056403, "grad_norm": 21.721588134765625, "learning_rate": 7.931518401047174e-07, "loss": 2.2456, "step": 4701500 }, { "epoch": 9.842750443782027, "grad_norm": 19.53451156616211, "learning_rate": 7.879075836144256e-07, "loss": 2.2652, "step": 4702000 }, { "epoch": 9.843797099507654, "grad_norm": 28.134775161743164, "learning_rate": 7.826633271241336e-07, "loss": 2.2671, "step": 4702500 }, { "epoch": 9.844843755233278, "grad_norm": 17.95657730102539, "learning_rate": 7.774190706338418e-07, "loss": 2.2793, "step": 4703000 }, { "epoch": 9.845890410958905, "grad_norm": 20.91094398498535, "learning_rate": 7.721748141435501e-07, "loss": 2.2648, "step": 4703500 }, { "epoch": 9.84693706668453, "grad_norm": 19.924907684326172, "learning_rate": 7.669305576532582e-07, "loss": 2.2494, "step": 4704000 }, { "epoch": 9.847983722410156, "grad_norm": 17.82807159423828, "learning_rate": 7.616863011629664e-07, "loss": 2.2622, "step": 4704500 }, { "epoch": 9.849030378135781, "grad_norm": 17.200380325317383, "learning_rate": 7.564420446726745e-07, "loss": 2.2764, "step": 4705000 }, { "epoch": 9.850077033861407, "grad_norm": 18.583032608032227, "learning_rate": 7.511977881823826e-07, "loss": 2.2643, "step": 4705500 }, { "epoch": 9.851123689587032, "grad_norm": 21.339458465576172, "learning_rate": 7.459535316920909e-07, "loss": 2.264, "step": 4706000 }, { "epoch": 9.852170345312658, "grad_norm": 17.37886619567871, "learning_rate": 7.407092752017991e-07, "loss": 2.2657, "step": 4706500 }, { "epoch": 9.853217001038283, "grad_norm": 21.308313369750977, "learning_rate": 7.354650187115072e-07, "loss": 2.2416, "step": 4707000 }, { "epoch": 9.854263656763909, "grad_norm": 25.233449935913086, "learning_rate": 7.302207622212153e-07, "loss": 2.2697, "step": 4707500 }, { "epoch": 9.855310312489534, "grad_norm": 20.350841522216797, "learning_rate": 7.249765057309235e-07, "loss": 2.2621, "step": 4708000 }, { "epoch": 9.85635696821516, "grad_norm": 19.19670867919922, "learning_rate": 7.197322492406317e-07, "loss": 2.2568, "step": 4708500 }, { "epoch": 9.857403623940785, "grad_norm": 23.235774993896484, "learning_rate": 7.144879927503399e-07, "loss": 2.2406, "step": 4709000 }, { "epoch": 9.85845027966641, "grad_norm": 19.02288055419922, "learning_rate": 7.092437362600481e-07, "loss": 2.2543, "step": 4709500 }, { "epoch": 9.859496935392036, "grad_norm": 20.400854110717773, "learning_rate": 7.039994797697562e-07, "loss": 2.252, "step": 4710000 }, { "epoch": 9.860543591117661, "grad_norm": 19.826478958129883, "learning_rate": 6.987552232794643e-07, "loss": 2.261, "step": 4710500 }, { "epoch": 9.861590246843287, "grad_norm": 20.849102020263672, "learning_rate": 6.935109667891725e-07, "loss": 2.2605, "step": 4711000 }, { "epoch": 9.862636902568912, "grad_norm": 18.095062255859375, "learning_rate": 6.882667102988807e-07, "loss": 2.2409, "step": 4711500 }, { "epoch": 9.863683558294538, "grad_norm": 20.547452926635742, "learning_rate": 6.830224538085889e-07, "loss": 2.2608, "step": 4712000 }, { "epoch": 9.864730214020163, "grad_norm": 19.158432006835938, "learning_rate": 6.777781973182971e-07, "loss": 2.256, "step": 4712500 }, { "epoch": 9.865776869745789, "grad_norm": 18.02251434326172, "learning_rate": 6.725339408280052e-07, "loss": 2.2645, "step": 4713000 }, { "epoch": 9.866823525471414, "grad_norm": 19.674606323242188, "learning_rate": 6.672896843377133e-07, "loss": 2.2424, "step": 4713500 }, { "epoch": 9.86787018119704, "grad_norm": 21.776317596435547, "learning_rate": 6.620454278474215e-07, "loss": 2.2522, "step": 4714000 }, { "epoch": 9.868916836922665, "grad_norm": 20.25851058959961, "learning_rate": 6.568011713571298e-07, "loss": 2.2518, "step": 4714500 }, { "epoch": 9.86996349264829, "grad_norm": 18.139528274536133, "learning_rate": 6.515569148668379e-07, "loss": 2.2582, "step": 4715000 }, { "epoch": 9.871010148373916, "grad_norm": 22.100156784057617, "learning_rate": 6.46312658376546e-07, "loss": 2.2601, "step": 4715500 }, { "epoch": 9.872056804099541, "grad_norm": 17.797283172607422, "learning_rate": 6.410684018862542e-07, "loss": 2.272, "step": 4716000 }, { "epoch": 9.873103459825167, "grad_norm": 22.170669555664062, "learning_rate": 6.358241453959623e-07, "loss": 2.2478, "step": 4716500 }, { "epoch": 9.874150115550792, "grad_norm": 18.495872497558594, "learning_rate": 6.305798889056705e-07, "loss": 2.2731, "step": 4717000 }, { "epoch": 9.875196771276418, "grad_norm": 23.44637107849121, "learning_rate": 6.253356324153788e-07, "loss": 2.2493, "step": 4717500 }, { "epoch": 9.876243427002043, "grad_norm": 18.724061965942383, "learning_rate": 6.200913759250869e-07, "loss": 2.2629, "step": 4718000 }, { "epoch": 9.877290082727669, "grad_norm": 25.033544540405273, "learning_rate": 6.14847119434795e-07, "loss": 2.2553, "step": 4718500 }, { "epoch": 9.878336738453294, "grad_norm": 25.49629020690918, "learning_rate": 6.096028629445032e-07, "loss": 2.2646, "step": 4719000 }, { "epoch": 9.87938339417892, "grad_norm": 18.60964012145996, "learning_rate": 6.043586064542113e-07, "loss": 2.26, "step": 4719500 }, { "epoch": 9.880430049904545, "grad_norm": 24.28184700012207, "learning_rate": 5.991143499639195e-07, "loss": 2.2571, "step": 4720000 }, { "epoch": 9.88147670563017, "grad_norm": 18.379467010498047, "learning_rate": 5.938700934736278e-07, "loss": 2.2434, "step": 4720500 }, { "epoch": 9.882523361355796, "grad_norm": 21.401119232177734, "learning_rate": 5.886258369833359e-07, "loss": 2.2523, "step": 4721000 }, { "epoch": 9.883570017081421, "grad_norm": 18.35260772705078, "learning_rate": 5.83381580493044e-07, "loss": 2.2598, "step": 4721500 }, { "epoch": 9.884616672807047, "grad_norm": 20.72488021850586, "learning_rate": 5.781373240027522e-07, "loss": 2.2437, "step": 4722000 }, { "epoch": 9.885663328532672, "grad_norm": 18.556344985961914, "learning_rate": 5.728930675124604e-07, "loss": 2.2908, "step": 4722500 }, { "epoch": 9.886709984258298, "grad_norm": 18.10731315612793, "learning_rate": 5.676488110221686e-07, "loss": 2.2646, "step": 4723000 }, { "epoch": 9.887756639983923, "grad_norm": 19.581253051757812, "learning_rate": 5.624045545318768e-07, "loss": 2.2826, "step": 4723500 }, { "epoch": 9.888803295709549, "grad_norm": 25.210729598999023, "learning_rate": 5.571602980415849e-07, "loss": 2.2623, "step": 4724000 }, { "epoch": 9.889849951435174, "grad_norm": 20.279621124267578, "learning_rate": 5.51916041551293e-07, "loss": 2.2672, "step": 4724500 }, { "epoch": 9.8908966071608, "grad_norm": 19.507869720458984, "learning_rate": 5.466717850610012e-07, "loss": 2.2699, "step": 4725000 }, { "epoch": 9.891943262886425, "grad_norm": 21.971691131591797, "learning_rate": 5.414275285707094e-07, "loss": 2.2801, "step": 4725500 }, { "epoch": 9.89298991861205, "grad_norm": 18.022375106811523, "learning_rate": 5.361832720804176e-07, "loss": 2.2653, "step": 4726000 }, { "epoch": 9.894036574337676, "grad_norm": 20.217565536499023, "learning_rate": 5.309390155901257e-07, "loss": 2.2698, "step": 4726500 }, { "epoch": 9.895083230063301, "grad_norm": 22.67332649230957, "learning_rate": 5.256947590998339e-07, "loss": 2.271, "step": 4727000 }, { "epoch": 9.896129885788927, "grad_norm": 18.18659019470215, "learning_rate": 5.20450502609542e-07, "loss": 2.2643, "step": 4727500 }, { "epoch": 9.897176541514552, "grad_norm": 17.52239227294922, "learning_rate": 5.152062461192502e-07, "loss": 2.2515, "step": 4728000 }, { "epoch": 9.898223197240178, "grad_norm": 26.433683395385742, "learning_rate": 5.099619896289584e-07, "loss": 2.251, "step": 4728500 }, { "epoch": 9.899269852965803, "grad_norm": 24.14431381225586, "learning_rate": 5.047177331386666e-07, "loss": 2.2413, "step": 4729000 }, { "epoch": 9.900316508691429, "grad_norm": 21.04739761352539, "learning_rate": 4.994734766483747e-07, "loss": 2.2709, "step": 4729500 }, { "epoch": 9.901363164417054, "grad_norm": 19.34337043762207, "learning_rate": 4.942292201580829e-07, "loss": 2.2595, "step": 4730000 }, { "epoch": 9.90240982014268, "grad_norm": 17.984134674072266, "learning_rate": 4.88984963667791e-07, "loss": 2.2507, "step": 4730500 }, { "epoch": 9.903456475868305, "grad_norm": 24.613805770874023, "learning_rate": 4.837407071774992e-07, "loss": 2.2452, "step": 4731000 }, { "epoch": 9.90450313159393, "grad_norm": 19.18947982788086, "learning_rate": 4.784964506872075e-07, "loss": 2.2658, "step": 4731500 }, { "epoch": 9.905549787319556, "grad_norm": 22.026010513305664, "learning_rate": 4.7325219419691557e-07, "loss": 2.267, "step": 4732000 }, { "epoch": 9.906596443045181, "grad_norm": 21.217336654663086, "learning_rate": 4.6800793770662374e-07, "loss": 2.2622, "step": 4732500 }, { "epoch": 9.907643098770807, "grad_norm": 17.97685432434082, "learning_rate": 4.6276368121633185e-07, "loss": 2.2559, "step": 4733000 }, { "epoch": 9.908689754496432, "grad_norm": 18.714689254760742, "learning_rate": 4.575194247260401e-07, "loss": 2.2494, "step": 4733500 }, { "epoch": 9.909736410222058, "grad_norm": 20.3554744720459, "learning_rate": 4.5227516823574824e-07, "loss": 2.2645, "step": 4734000 }, { "epoch": 9.910783065947683, "grad_norm": 20.775136947631836, "learning_rate": 4.470309117454564e-07, "loss": 2.2406, "step": 4734500 }, { "epoch": 9.911829721673309, "grad_norm": 20.16371726989746, "learning_rate": 4.417866552551646e-07, "loss": 2.2439, "step": 4735000 }, { "epoch": 9.912876377398934, "grad_norm": 20.71468734741211, "learning_rate": 4.365423987648727e-07, "loss": 2.2636, "step": 4735500 }, { "epoch": 9.91392303312456, "grad_norm": 19.13040542602539, "learning_rate": 4.3129814227458086e-07, "loss": 2.2446, "step": 4736000 }, { "epoch": 9.914969688850185, "grad_norm": 18.100303649902344, "learning_rate": 4.260538857842891e-07, "loss": 2.2675, "step": 4736500 }, { "epoch": 9.916016344575812, "grad_norm": 19.550617218017578, "learning_rate": 4.2080962929399725e-07, "loss": 2.2739, "step": 4737000 }, { "epoch": 9.917063000301436, "grad_norm": 19.00163459777832, "learning_rate": 4.155653728037054e-07, "loss": 2.2559, "step": 4737500 }, { "epoch": 9.918109656027063, "grad_norm": 18.75982093811035, "learning_rate": 4.103211163134136e-07, "loss": 2.2718, "step": 4738000 }, { "epoch": 9.919156311752689, "grad_norm": 21.60043716430664, "learning_rate": 4.050768598231217e-07, "loss": 2.2655, "step": 4738500 }, { "epoch": 9.920202967478314, "grad_norm": 18.413040161132812, "learning_rate": 3.9983260333282986e-07, "loss": 2.2561, "step": 4739000 }, { "epoch": 9.92124962320394, "grad_norm": 17.17730712890625, "learning_rate": 3.945883468425381e-07, "loss": 2.2468, "step": 4739500 }, { "epoch": 9.922296278929565, "grad_norm": 20.01080894470215, "learning_rate": 3.8934409035224625e-07, "loss": 2.2652, "step": 4740000 }, { "epoch": 9.92334293465519, "grad_norm": 17.521486282348633, "learning_rate": 3.840998338619544e-07, "loss": 2.2629, "step": 4740500 }, { "epoch": 9.924389590380816, "grad_norm": 22.583168029785156, "learning_rate": 3.7885557737166253e-07, "loss": 2.2707, "step": 4741000 }, { "epoch": 9.925436246106441, "grad_norm": 21.256816864013672, "learning_rate": 3.7361132088137076e-07, "loss": 2.284, "step": 4741500 }, { "epoch": 9.926482901832067, "grad_norm": 27.462726593017578, "learning_rate": 3.683670643910789e-07, "loss": 2.2574, "step": 4742000 }, { "epoch": 9.927529557557692, "grad_norm": 21.642398834228516, "learning_rate": 3.631228079007871e-07, "loss": 2.2587, "step": 4742500 }, { "epoch": 9.928576213283318, "grad_norm": 19.46842384338379, "learning_rate": 3.5787855141049526e-07, "loss": 2.2706, "step": 4743000 }, { "epoch": 9.929622869008943, "grad_norm": 22.422950744628906, "learning_rate": 3.526342949202034e-07, "loss": 2.2646, "step": 4743500 }, { "epoch": 9.930669524734569, "grad_norm": 20.29638671875, "learning_rate": 3.473900384299116e-07, "loss": 2.2535, "step": 4744000 }, { "epoch": 9.931716180460194, "grad_norm": 20.804080963134766, "learning_rate": 3.4214578193961976e-07, "loss": 2.242, "step": 4744500 }, { "epoch": 9.93276283618582, "grad_norm": 18.67584228515625, "learning_rate": 3.369015254493279e-07, "loss": 2.2604, "step": 4745000 }, { "epoch": 9.933809491911445, "grad_norm": 16.70514488220215, "learning_rate": 3.316572689590361e-07, "loss": 2.2658, "step": 4745500 }, { "epoch": 9.93485614763707, "grad_norm": 18.51653480529785, "learning_rate": 3.2641301246874426e-07, "loss": 2.2392, "step": 4746000 }, { "epoch": 9.935902803362696, "grad_norm": 21.45918846130371, "learning_rate": 3.211687559784524e-07, "loss": 2.2655, "step": 4746500 }, { "epoch": 9.936949459088321, "grad_norm": 18.553783416748047, "learning_rate": 3.159244994881606e-07, "loss": 2.2575, "step": 4747000 }, { "epoch": 9.937996114813947, "grad_norm": 18.62192726135254, "learning_rate": 3.1068024299786877e-07, "loss": 2.2536, "step": 4747500 }, { "epoch": 9.939042770539572, "grad_norm": 23.453432083129883, "learning_rate": 3.054359865075769e-07, "loss": 2.2529, "step": 4748000 }, { "epoch": 9.940089426265198, "grad_norm": 23.587684631347656, "learning_rate": 3.001917300172851e-07, "loss": 2.2338, "step": 4748500 }, { "epoch": 9.941136081990823, "grad_norm": 21.109851837158203, "learning_rate": 2.949474735269932e-07, "loss": 2.2648, "step": 4749000 }, { "epoch": 9.942182737716449, "grad_norm": 20.032621383666992, "learning_rate": 2.8970321703670144e-07, "loss": 2.2687, "step": 4749500 }, { "epoch": 9.943229393442074, "grad_norm": 17.290164947509766, "learning_rate": 2.844589605464096e-07, "loss": 2.2654, "step": 4750000 }, { "epoch": 9.9442760491677, "grad_norm": 26.638444900512695, "learning_rate": 2.792147040561177e-07, "loss": 2.2435, "step": 4750500 }, { "epoch": 9.945322704893325, "grad_norm": 20.328426361083984, "learning_rate": 2.7397044756582594e-07, "loss": 2.2641, "step": 4751000 }, { "epoch": 9.94636936061895, "grad_norm": 19.68665313720703, "learning_rate": 2.687261910755341e-07, "loss": 2.2553, "step": 4751500 }, { "epoch": 9.947416016344576, "grad_norm": 25.949445724487305, "learning_rate": 2.634819345852422e-07, "loss": 2.2649, "step": 4752000 }, { "epoch": 9.948462672070201, "grad_norm": 21.04167938232422, "learning_rate": 2.5823767809495044e-07, "loss": 2.267, "step": 4752500 }, { "epoch": 9.949509327795827, "grad_norm": 21.236391067504883, "learning_rate": 2.529934216046586e-07, "loss": 2.2417, "step": 4753000 }, { "epoch": 9.950555983521452, "grad_norm": 17.297643661499023, "learning_rate": 2.477491651143667e-07, "loss": 2.2622, "step": 4753500 }, { "epoch": 9.951602639247078, "grad_norm": 20.01759910583496, "learning_rate": 2.4250490862407495e-07, "loss": 2.2468, "step": 4754000 }, { "epoch": 9.952649294972703, "grad_norm": 15.493158340454102, "learning_rate": 2.372606521337831e-07, "loss": 2.2445, "step": 4754500 }, { "epoch": 9.953695950698329, "grad_norm": 20.48558235168457, "learning_rate": 2.3201639564349128e-07, "loss": 2.2489, "step": 4755000 }, { "epoch": 9.954742606423954, "grad_norm": 21.967029571533203, "learning_rate": 2.2677213915319942e-07, "loss": 2.2631, "step": 4755500 }, { "epoch": 9.95578926214958, "grad_norm": 23.87539291381836, "learning_rate": 2.215278826629076e-07, "loss": 2.2529, "step": 4756000 }, { "epoch": 9.956835917875205, "grad_norm": 24.357261657714844, "learning_rate": 2.1628362617261579e-07, "loss": 2.2472, "step": 4756500 }, { "epoch": 9.95788257360083, "grad_norm": 20.10662269592285, "learning_rate": 2.1103936968232393e-07, "loss": 2.264, "step": 4757000 }, { "epoch": 9.958929229326456, "grad_norm": 22.3586368560791, "learning_rate": 2.057951131920321e-07, "loss": 2.263, "step": 4757500 }, { "epoch": 9.959975885052081, "grad_norm": 19.867555618286133, "learning_rate": 2.005508567017403e-07, "loss": 2.2593, "step": 4758000 }, { "epoch": 9.961022540777707, "grad_norm": 23.947956085205078, "learning_rate": 1.9530660021144843e-07, "loss": 2.2428, "step": 4758500 }, { "epoch": 9.962069196503332, "grad_norm": 18.215438842773438, "learning_rate": 1.900623437211566e-07, "loss": 2.2572, "step": 4759000 }, { "epoch": 9.963115852228958, "grad_norm": 23.311100006103516, "learning_rate": 1.8481808723086476e-07, "loss": 2.2376, "step": 4759500 }, { "epoch": 9.964162507954583, "grad_norm": 21.205135345458984, "learning_rate": 1.7957383074057293e-07, "loss": 2.2755, "step": 4760000 }, { "epoch": 9.965209163680209, "grad_norm": 20.45595359802246, "learning_rate": 1.743295742502811e-07, "loss": 2.2541, "step": 4760500 }, { "epoch": 9.966255819405834, "grad_norm": 22.532546997070312, "learning_rate": 1.6908531775998927e-07, "loss": 2.2517, "step": 4761000 }, { "epoch": 9.96730247513146, "grad_norm": 19.538829803466797, "learning_rate": 1.6384106126969744e-07, "loss": 2.264, "step": 4761500 }, { "epoch": 9.968349130857085, "grad_norm": 20.408369064331055, "learning_rate": 1.585968047794056e-07, "loss": 2.243, "step": 4762000 }, { "epoch": 9.96939578658271, "grad_norm": 21.553712844848633, "learning_rate": 1.5335254828911377e-07, "loss": 2.2578, "step": 4762500 }, { "epoch": 9.970442442308336, "grad_norm": 22.05225372314453, "learning_rate": 1.4810829179882194e-07, "loss": 2.26, "step": 4763000 }, { "epoch": 9.971489098033961, "grad_norm": 19.386775970458984, "learning_rate": 1.428640353085301e-07, "loss": 2.2527, "step": 4763500 }, { "epoch": 9.972535753759587, "grad_norm": 17.96553611755371, "learning_rate": 1.3761977881823827e-07, "loss": 2.2472, "step": 4764000 }, { "epoch": 9.973582409485212, "grad_norm": 25.7299861907959, "learning_rate": 1.3237552232794644e-07, "loss": 2.2518, "step": 4764500 }, { "epoch": 9.974629065210838, "grad_norm": 21.224266052246094, "learning_rate": 1.271312658376546e-07, "loss": 2.2494, "step": 4765000 }, { "epoch": 9.975675720936463, "grad_norm": 19.132549285888672, "learning_rate": 1.2188700934736278e-07, "loss": 2.259, "step": 4765500 }, { "epoch": 9.976722376662089, "grad_norm": 22.827224731445312, "learning_rate": 1.1664275285707094e-07, "loss": 2.2522, "step": 4766000 }, { "epoch": 9.977769032387714, "grad_norm": 20.774799346923828, "learning_rate": 1.1139849636677911e-07, "loss": 2.2578, "step": 4766500 }, { "epoch": 9.97881568811334, "grad_norm": 21.217575073242188, "learning_rate": 1.0615423987648727e-07, "loss": 2.2554, "step": 4767000 }, { "epoch": 9.979862343838965, "grad_norm": 18.233169555664062, "learning_rate": 1.0090998338619545e-07, "loss": 2.2475, "step": 4767500 }, { "epoch": 9.98090899956459, "grad_norm": 18.959636688232422, "learning_rate": 9.566572689590361e-08, "loss": 2.2482, "step": 4768000 }, { "epoch": 9.981955655290216, "grad_norm": 19.252914428710938, "learning_rate": 9.042147040561178e-08, "loss": 2.251, "step": 4768500 }, { "epoch": 9.983002311015841, "grad_norm": 18.45645523071289, "learning_rate": 8.517721391531994e-08, "loss": 2.2566, "step": 4769000 }, { "epoch": 9.984048966741467, "grad_norm": 18.53115463256836, "learning_rate": 7.993295742502812e-08, "loss": 2.2497, "step": 4769500 }, { "epoch": 9.985095622467092, "grad_norm": 24.482879638671875, "learning_rate": 7.468870093473627e-08, "loss": 2.2439, "step": 4770000 }, { "epoch": 9.986142278192718, "grad_norm": 19.729045867919922, "learning_rate": 6.944444444444445e-08, "loss": 2.2364, "step": 4770500 }, { "epoch": 9.987188933918343, "grad_norm": 18.791532516479492, "learning_rate": 6.420018795415261e-08, "loss": 2.2614, "step": 4771000 }, { "epoch": 9.98823558964397, "grad_norm": 18.017929077148438, "learning_rate": 5.895593146386079e-08, "loss": 2.2729, "step": 4771500 }, { "epoch": 9.989282245369594, "grad_norm": 22.297151565551758, "learning_rate": 5.371167497356895e-08, "loss": 2.2494, "step": 4772000 }, { "epoch": 9.990328901095221, "grad_norm": 21.083171844482422, "learning_rate": 4.846741848327712e-08, "loss": 2.2704, "step": 4772500 }, { "epoch": 9.991375556820845, "grad_norm": 25.129867553710938, "learning_rate": 4.3223161992985285e-08, "loss": 2.2494, "step": 4773000 }, { "epoch": 9.992422212546472, "grad_norm": 20.42827033996582, "learning_rate": 3.797890550269345e-08, "loss": 2.2534, "step": 4773500 }, { "epoch": 9.993468868272098, "grad_norm": Infinity, "learning_rate": 3.273464901240162e-08, "loss": 2.2633, "step": 4774000 }, { "epoch": 9.994515523997723, "grad_norm": 17.677579879760742, "learning_rate": 2.7490392522109788e-08, "loss": 2.2623, "step": 4774500 }, { "epoch": 9.995562179723349, "grad_norm": 27.24789047241211, "learning_rate": 2.2246136031817955e-08, "loss": 2.258, "step": 4775000 }, { "epoch": 9.996608835448974, "grad_norm": 17.149065017700195, "learning_rate": 1.7001879541526123e-08, "loss": 2.2485, "step": 4775500 }, { "epoch": 9.9976554911746, "grad_norm": 21.983835220336914, "learning_rate": 1.1757623051234289e-08, "loss": 2.2749, "step": 4776000 }, { "epoch": 9.998702146900225, "grad_norm": 22.031213760375977, "learning_rate": 6.513366560942456e-09, "loss": 2.2704, "step": 4776500 }, { "epoch": 9.99974880262585, "grad_norm": 18.576507568359375, "learning_rate": 1.2691100706506233e-09, "loss": 2.2465, "step": 4777000 } ], "logging_steps": 500, "max_steps": 4777120, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8.355551015541228e+17, "train_batch_size": 40, "trial_name": null, "trial_params": null }