{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.6914119359534207, "eval_steps": 100, "global_step": 3800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0009097525473071324, "grad_norm": 1.0602493286132812, "learning_rate": 1.2121212121212122e-06, "loss": 1.7156932830810547, "step": 5 }, { "epoch": 0.001819505094614265, "grad_norm": 1.1577719449996948, "learning_rate": 2.7272727272727272e-06, "loss": 1.6629371643066406, "step": 10 }, { "epoch": 0.0027292576419213972, "grad_norm": 1.0288419723510742, "learning_rate": 4.242424242424243e-06, "loss": 1.6706295013427734, "step": 15 }, { "epoch": 0.00363901018922853, "grad_norm": 2.129403829574585, "learning_rate": 5.7575757575757586e-06, "loss": 1.7363752365112304, "step": 20 }, { "epoch": 0.004548762736535662, "grad_norm": 1.9468326568603516, "learning_rate": 7.272727272727272e-06, "loss": 1.7111135482788087, "step": 25 }, { "epoch": 0.0054585152838427945, "grad_norm": 1.1269357204437256, "learning_rate": 8.787878787878788e-06, "loss": 1.6924203872680663, "step": 30 }, { "epoch": 0.006368267831149927, "grad_norm": 1.4021248817443848, "learning_rate": 1.0303030303030304e-05, "loss": 1.658310317993164, "step": 35 }, { "epoch": 0.00727802037845706, "grad_norm": 1.313381314277649, "learning_rate": 1.1818181818181819e-05, "loss": 1.5383296012878418, "step": 40 }, { "epoch": 0.008187772925764192, "grad_norm": 2.4359891414642334, "learning_rate": 1.3333333333333333e-05, "loss": 1.4302565574645996, "step": 45 }, { "epoch": 0.009097525473071324, "grad_norm": 1.6459542512893677, "learning_rate": 1.484848484848485e-05, "loss": 1.2602953910827637, "step": 50 }, { "epoch": 0.010007278020378457, "grad_norm": 0.7953159213066101, "learning_rate": 1.6363636363636366e-05, "loss": 1.204326343536377, "step": 55 }, { "epoch": 0.010917030567685589, "grad_norm": 0.5824465155601501, "learning_rate": 1.787878787878788e-05, "loss": 1.068561840057373, "step": 60 }, { "epoch": 0.011826783114992722, "grad_norm": 0.39265626668930054, "learning_rate": 1.9393939393939395e-05, "loss": 0.9570062637329102, "step": 65 }, { "epoch": 0.012736535662299854, "grad_norm": 0.3387283384799957, "learning_rate": 2.090909090909091e-05, "loss": 0.9454713821411133, "step": 70 }, { "epoch": 0.013646288209606987, "grad_norm": 0.3182811141014099, "learning_rate": 2.2424242424242424e-05, "loss": 0.8901592254638672, "step": 75 }, { "epoch": 0.01455604075691412, "grad_norm": 0.2735312879085541, "learning_rate": 2.393939393939394e-05, "loss": 0.8491583824157715, "step": 80 }, { "epoch": 0.015465793304221253, "grad_norm": 0.2376435250043869, "learning_rate": 2.5454545454545454e-05, "loss": 0.8109179496765136, "step": 85 }, { "epoch": 0.016375545851528384, "grad_norm": 0.2161586880683899, "learning_rate": 2.696969696969697e-05, "loss": 0.76962308883667, "step": 90 }, { "epoch": 0.017285298398835518, "grad_norm": 0.19587980210781097, "learning_rate": 2.8484848484848486e-05, "loss": 0.7301986694335938, "step": 95 }, { "epoch": 0.018195050946142648, "grad_norm": 0.20971694588661194, "learning_rate": 3e-05, "loss": 0.7269618034362793, "step": 100 }, { "epoch": 0.018195050946142648, "eval_loss": 2.605874538421631, "eval_runtime": 1120.0905, "eval_samples_per_second": 33.935, "eval_steps_per_second": 8.484, "step": 100 }, { "epoch": 0.01910480349344978, "grad_norm": 0.10413152724504471, "learning_rate": 3.151515151515151e-05, "loss": 0.3250573635101318, "step": 105 }, { "epoch": 0.020014556040756915, "grad_norm": 0.09383206814527512, "learning_rate": 3.303030303030303e-05, "loss": 0.3277724742889404, "step": 110 }, { "epoch": 0.020924308588064048, "grad_norm": 0.1195850670337677, "learning_rate": 3.454545454545455e-05, "loss": 0.3215961217880249, "step": 115 }, { "epoch": 0.021834061135371178, "grad_norm": 0.0715397521853447, "learning_rate": 3.606060606060606e-05, "loss": 0.3120795965194702, "step": 120 }, { "epoch": 0.02274381368267831, "grad_norm": 0.068007692694664, "learning_rate": 3.757575757575758e-05, "loss": 0.2964257955551147, "step": 125 }, { "epoch": 0.023653566229985445, "grad_norm": 0.09345484524965286, "learning_rate": 3.909090909090909e-05, "loss": 0.30776252746582033, "step": 130 }, { "epoch": 0.024563318777292575, "grad_norm": 0.05577846243977547, "learning_rate": 4.0606060606060606e-05, "loss": 0.3180255889892578, "step": 135 }, { "epoch": 0.025473071324599708, "grad_norm": 0.05919989198446274, "learning_rate": 4.212121212121212e-05, "loss": 0.31608285903930666, "step": 140 }, { "epoch": 0.02638282387190684, "grad_norm": 0.05644674599170685, "learning_rate": 4.3636363636363636e-05, "loss": 0.2993780136108398, "step": 145 }, { "epoch": 0.027292576419213975, "grad_norm": 0.059986088424921036, "learning_rate": 4.515151515151516e-05, "loss": 0.2931638479232788, "step": 150 }, { "epoch": 0.028202328966521105, "grad_norm": 0.05941484495997429, "learning_rate": 4.666666666666667e-05, "loss": 0.29284651279449464, "step": 155 }, { "epoch": 0.02911208151382824, "grad_norm": 0.0579044483602047, "learning_rate": 4.8181818181818186e-05, "loss": 0.2927037000656128, "step": 160 }, { "epoch": 0.030021834061135372, "grad_norm": 0.061985693871974945, "learning_rate": 4.9696969696969694e-05, "loss": 0.28671720027923586, "step": 165 }, { "epoch": 0.030931586608442505, "grad_norm": 0.05715535953640938, "learning_rate": 4.999993064772809e-05, "loss": 0.2817929744720459, "step": 170 }, { "epoch": 0.03184133915574964, "grad_norm": 0.06549780815839767, "learning_rate": 4.999964890478288e-05, "loss": 0.27853829860687257, "step": 175 }, { "epoch": 0.03275109170305677, "grad_norm": 0.05948757752776146, "learning_rate": 4.999915043908795e-05, "loss": 0.27522289752960205, "step": 180 }, { "epoch": 0.0336608442503639, "grad_norm": 0.06262889504432678, "learning_rate": 4.9998435254964515e-05, "loss": 0.270997428894043, "step": 185 }, { "epoch": 0.034570596797671035, "grad_norm": 0.06916829943656921, "learning_rate": 4.999750335861253e-05, "loss": 0.2788438558578491, "step": 190 }, { "epoch": 0.035480349344978165, "grad_norm": 0.06128217652440071, "learning_rate": 4.9996354758110624e-05, "loss": 0.25649352073669435, "step": 195 }, { "epoch": 0.036390101892285295, "grad_norm": 0.06704027950763702, "learning_rate": 4.999498946341606e-05, "loss": 0.25619523525238036, "step": 200 }, { "epoch": 0.03729985443959243, "grad_norm": 0.061678580939769745, "learning_rate": 4.999340748636462e-05, "loss": 0.24956226348876953, "step": 205 }, { "epoch": 0.03820960698689956, "grad_norm": 0.07328873127698898, "learning_rate": 4.999160884067051e-05, "loss": 0.26169676780700685, "step": 210 }, { "epoch": 0.0391193595342067, "grad_norm": 0.08287990838289261, "learning_rate": 4.9989593541926246e-05, "loss": 0.2574604034423828, "step": 215 }, { "epoch": 0.04002911208151383, "grad_norm": 0.06787359714508057, "learning_rate": 4.9987361607602525e-05, "loss": 0.25351409912109374, "step": 220 }, { "epoch": 0.04093886462882096, "grad_norm": 0.06695502996444702, "learning_rate": 4.998491305704805e-05, "loss": 0.24522039890289307, "step": 225 }, { "epoch": 0.041848617176128096, "grad_norm": 0.08872214704751968, "learning_rate": 4.9982247911489375e-05, "loss": 0.2581867933273315, "step": 230 }, { "epoch": 0.042758369723435226, "grad_norm": 0.07637131959199905, "learning_rate": 4.9979366194030743e-05, "loss": 0.25569658279418944, "step": 235 }, { "epoch": 0.043668122270742356, "grad_norm": 0.08158119022846222, "learning_rate": 4.997626792965385e-05, "loss": 0.2529409646987915, "step": 240 }, { "epoch": 0.04457787481804949, "grad_norm": 0.07529161125421524, "learning_rate": 4.997295314521766e-05, "loss": 0.24049024581909179, "step": 245 }, { "epoch": 0.04548762736535662, "grad_norm": 0.08860139548778534, "learning_rate": 4.996942186945813e-05, "loss": 0.2490522861480713, "step": 250 }, { "epoch": 0.04639737991266375, "grad_norm": 0.0850321501493454, "learning_rate": 4.9965674132988005e-05, "loss": 0.24180831909179687, "step": 255 }, { "epoch": 0.04730713245997089, "grad_norm": 0.07556115090847015, "learning_rate": 4.996170996829653e-05, "loss": 0.2509631872177124, "step": 260 }, { "epoch": 0.04821688500727802, "grad_norm": 0.07971206307411194, "learning_rate": 4.995752940974918e-05, "loss": 0.24398891925811766, "step": 265 }, { "epoch": 0.04912663755458515, "grad_norm": 0.09149336814880371, "learning_rate": 4.9953132493587344e-05, "loss": 0.2300492286682129, "step": 270 }, { "epoch": 0.050036390101892286, "grad_norm": 0.08265820890665054, "learning_rate": 4.9948519257928034e-05, "loss": 0.24246792793273925, "step": 275 }, { "epoch": 0.050946142649199416, "grad_norm": 0.10328587144613266, "learning_rate": 4.9943689742763534e-05, "loss": 0.2367171049118042, "step": 280 }, { "epoch": 0.05185589519650655, "grad_norm": 0.0836917981505394, "learning_rate": 4.993864398996105e-05, "loss": 0.23215813636779786, "step": 285 }, { "epoch": 0.05276564774381368, "grad_norm": 0.09475161135196686, "learning_rate": 4.99333820432624e-05, "loss": 0.2350748062133789, "step": 290 }, { "epoch": 0.05367540029112081, "grad_norm": 0.08040128648281097, "learning_rate": 4.992790394828355e-05, "loss": 0.23253886699676513, "step": 295 }, { "epoch": 0.05458515283842795, "grad_norm": 0.08852150291204453, "learning_rate": 4.992220975251428e-05, "loss": 0.23856515884399415, "step": 300 }, { "epoch": 0.05549490538573508, "grad_norm": 0.09565229713916779, "learning_rate": 4.991629950531775e-05, "loss": 0.23311660289764405, "step": 305 }, { "epoch": 0.05640465793304221, "grad_norm": 0.08158160001039505, "learning_rate": 4.991017325793009e-05, "loss": 0.22467944622039795, "step": 310 }, { "epoch": 0.05731441048034935, "grad_norm": 0.07746429741382599, "learning_rate": 4.990383106345994e-05, "loss": 0.229844069480896, "step": 315 }, { "epoch": 0.05822416302765648, "grad_norm": 0.08564355969429016, "learning_rate": 4.989727297688797e-05, "loss": 0.22414517402648926, "step": 320 }, { "epoch": 0.05913391557496361, "grad_norm": 0.07517435401678085, "learning_rate": 4.9890499055066435e-05, "loss": 0.2236532211303711, "step": 325 }, { "epoch": 0.060043668122270744, "grad_norm": 0.111734539270401, "learning_rate": 4.988350935671869e-05, "loss": 0.21474847793579102, "step": 330 }, { "epoch": 0.060953420669577874, "grad_norm": 0.09906989336013794, "learning_rate": 4.987630394243866e-05, "loss": 0.23321933746337892, "step": 335 }, { "epoch": 0.06186317321688501, "grad_norm": 0.10131457448005676, "learning_rate": 4.98688828746903e-05, "loss": 0.2310662031173706, "step": 340 }, { "epoch": 0.06277292576419213, "grad_norm": 0.09203507006168365, "learning_rate": 4.986124621780708e-05, "loss": 0.22021169662475587, "step": 345 }, { "epoch": 0.06368267831149928, "grad_norm": 0.09505912661552429, "learning_rate": 4.9853394037991416e-05, "loss": 0.2197155237197876, "step": 350 }, { "epoch": 0.06459243085880641, "grad_norm": 0.09038657695055008, "learning_rate": 4.984532640331412e-05, "loss": 0.22066287994384765, "step": 355 }, { "epoch": 0.06550218340611354, "grad_norm": 0.09707064181566238, "learning_rate": 4.9837043383713753e-05, "loss": 0.22455451488494874, "step": 360 }, { "epoch": 0.06641193595342067, "grad_norm": 0.10367228090763092, "learning_rate": 4.98285450509961e-05, "loss": 0.21993820667266845, "step": 365 }, { "epoch": 0.0673216885007278, "grad_norm": 0.12229471653699875, "learning_rate": 4.9819831478833456e-05, "loss": 0.2168867588043213, "step": 370 }, { "epoch": 0.06823144104803494, "grad_norm": 0.0964592918753624, "learning_rate": 4.981090274276406e-05, "loss": 0.21579203605651856, "step": 375 }, { "epoch": 0.06914119359534207, "grad_norm": 0.09400496631860733, "learning_rate": 4.980175892019141e-05, "loss": 0.20972180366516113, "step": 380 }, { "epoch": 0.0700509461426492, "grad_norm": 0.08158645778894424, "learning_rate": 4.9792400090383594e-05, "loss": 0.22148358821868896, "step": 385 }, { "epoch": 0.07096069868995633, "grad_norm": 0.10916394740343094, "learning_rate": 4.978282633447261e-05, "loss": 0.2214418649673462, "step": 390 }, { "epoch": 0.07187045123726346, "grad_norm": 0.11138810962438583, "learning_rate": 4.9773037735453636e-05, "loss": 0.21814754009246826, "step": 395 }, { "epoch": 0.07278020378457059, "grad_norm": 0.10914396494626999, "learning_rate": 4.9763034378184365e-05, "loss": 0.21310818195343018, "step": 400 }, { "epoch": 0.07368995633187773, "grad_norm": 0.1043366864323616, "learning_rate": 4.975281634938421e-05, "loss": 0.21266789436340333, "step": 405 }, { "epoch": 0.07459970887918486, "grad_norm": 0.1036868542432785, "learning_rate": 4.9742383737633594e-05, "loss": 0.21606721878051757, "step": 410 }, { "epoch": 0.075509461426492, "grad_norm": 0.11640442907810211, "learning_rate": 4.9731736633373144e-05, "loss": 0.21532948017120362, "step": 415 }, { "epoch": 0.07641921397379912, "grad_norm": 0.11219926178455353, "learning_rate": 4.9720875128902956e-05, "loss": 0.2191627025604248, "step": 420 }, { "epoch": 0.07732896652110625, "grad_norm": 0.12103637307882309, "learning_rate": 4.970979931838176e-05, "loss": 0.20938868522644044, "step": 425 }, { "epoch": 0.0782387190684134, "grad_norm": 0.13274189829826355, "learning_rate": 4.96985092978261e-05, "loss": 0.21792960166931152, "step": 430 }, { "epoch": 0.07914847161572053, "grad_norm": 0.11164513230323792, "learning_rate": 4.968700516510954e-05, "loss": 0.2022618055343628, "step": 435 }, { "epoch": 0.08005822416302766, "grad_norm": 0.09532847255468369, "learning_rate": 4.967528701996174e-05, "loss": 0.21255812644958497, "step": 440 }, { "epoch": 0.08096797671033479, "grad_norm": 0.10279258340597153, "learning_rate": 4.96633549639677e-05, "loss": 0.20683050155639648, "step": 445 }, { "epoch": 0.08187772925764192, "grad_norm": 0.1257462352514267, "learning_rate": 4.965120910056677e-05, "loss": 0.21419920921325683, "step": 450 }, { "epoch": 0.08278748180494905, "grad_norm": 0.11663137376308441, "learning_rate": 4.963884953505186e-05, "loss": 0.2072287082672119, "step": 455 }, { "epoch": 0.08369723435225619, "grad_norm": 0.10488224029541016, "learning_rate": 4.96262763745684e-05, "loss": 0.1982678532600403, "step": 460 }, { "epoch": 0.08460698689956332, "grad_norm": 0.11801692098379135, "learning_rate": 4.961348972811354e-05, "loss": 0.20662031173706055, "step": 465 }, { "epoch": 0.08551673944687045, "grad_norm": 0.11318827420473099, "learning_rate": 4.96004897065351e-05, "loss": 0.20947303771972656, "step": 470 }, { "epoch": 0.08642649199417758, "grad_norm": 0.13409486413002014, "learning_rate": 4.95872764225307e-05, "loss": 0.19670876264572143, "step": 475 }, { "epoch": 0.08733624454148471, "grad_norm": 0.14440792798995972, "learning_rate": 4.957384999064672e-05, "loss": 0.19842848777770997, "step": 480 }, { "epoch": 0.08824599708879186, "grad_norm": 0.12246996909379959, "learning_rate": 4.956021052727731e-05, "loss": 0.20318071842193602, "step": 485 }, { "epoch": 0.08915574963609899, "grad_norm": 0.13437233865261078, "learning_rate": 4.954635815066342e-05, "loss": 0.21675212383270265, "step": 490 }, { "epoch": 0.09006550218340612, "grad_norm": 0.11109672486782074, "learning_rate": 4.9532292980891744e-05, "loss": 0.2100757837295532, "step": 495 }, { "epoch": 0.09097525473071325, "grad_norm": 0.1388893872499466, "learning_rate": 4.9518015139893675e-05, "loss": 0.20303285121917725, "step": 500 }, { "epoch": 0.09188500727802038, "grad_norm": 0.13239721953868866, "learning_rate": 4.950352475144427e-05, "loss": 0.2152268409729004, "step": 505 }, { "epoch": 0.0927947598253275, "grad_norm": 0.12834979593753815, "learning_rate": 4.948882194116119e-05, "loss": 0.20799248218536376, "step": 510 }, { "epoch": 0.09370451237263465, "grad_norm": 0.11886704713106155, "learning_rate": 4.947390683650354e-05, "loss": 0.20394976139068605, "step": 515 }, { "epoch": 0.09461426491994178, "grad_norm": 0.11398876458406448, "learning_rate": 4.945877956677083e-05, "loss": 0.2091092586517334, "step": 520 }, { "epoch": 0.09552401746724891, "grad_norm": 0.1422540694475174, "learning_rate": 4.944344026310186e-05, "loss": 0.19564238786697388, "step": 525 }, { "epoch": 0.09643377001455604, "grad_norm": 0.11359584331512451, "learning_rate": 4.9427889058473535e-05, "loss": 0.20493624210357667, "step": 530 }, { "epoch": 0.09734352256186317, "grad_norm": 0.11703553050756454, "learning_rate": 4.941212608769974e-05, "loss": 0.2098615884780884, "step": 535 }, { "epoch": 0.0982532751091703, "grad_norm": 0.14552047848701477, "learning_rate": 4.939615148743017e-05, "loss": 0.20382182598114013, "step": 540 }, { "epoch": 0.09916302765647744, "grad_norm": 0.13178016245365143, "learning_rate": 4.937996539614914e-05, "loss": 0.19901862144470214, "step": 545 }, { "epoch": 0.10007278020378457, "grad_norm": 0.635392427444458, "learning_rate": 4.936356795417439e-05, "loss": 0.20694944858551026, "step": 550 }, { "epoch": 0.1009825327510917, "grad_norm": 0.15019077062606812, "learning_rate": 4.934695930365586e-05, "loss": 0.19313746690750122, "step": 555 }, { "epoch": 0.10189228529839883, "grad_norm": 0.12941956520080566, "learning_rate": 4.9330139588574474e-05, "loss": 0.19671722650527954, "step": 560 }, { "epoch": 0.10280203784570596, "grad_norm": 0.13818831741809845, "learning_rate": 4.931310895474088e-05, "loss": 0.20026786327362062, "step": 565 }, { "epoch": 0.1037117903930131, "grad_norm": 0.12011194974184036, "learning_rate": 4.929586754979417e-05, "loss": 0.1932437539100647, "step": 570 }, { "epoch": 0.10462154294032024, "grad_norm": 0.1345364898443222, "learning_rate": 4.9278415523200644e-05, "loss": 0.20245940685272218, "step": 575 }, { "epoch": 0.10553129548762737, "grad_norm": 0.13281017541885376, "learning_rate": 4.926075302625247e-05, "loss": 0.19864981174468993, "step": 580 }, { "epoch": 0.1064410480349345, "grad_norm": 0.13465586304664612, "learning_rate": 4.924288021206639e-05, "loss": 0.19573183059692384, "step": 585 }, { "epoch": 0.10735080058224163, "grad_norm": 0.15225961804389954, "learning_rate": 4.9224797235582396e-05, "loss": 0.19946801662445068, "step": 590 }, { "epoch": 0.10826055312954876, "grad_norm": 0.12816746532917023, "learning_rate": 4.92065042535624e-05, "loss": 0.19851526021957397, "step": 595 }, { "epoch": 0.1091703056768559, "grad_norm": 0.13802853226661682, "learning_rate": 4.9188001424588824e-05, "loss": 0.19321763515472412, "step": 600 }, { "epoch": 0.11008005822416303, "grad_norm": 0.17504797875881195, "learning_rate": 4.9169288909063295e-05, "loss": 0.2032616138458252, "step": 605 }, { "epoch": 0.11098981077147016, "grad_norm": 0.13544194400310516, "learning_rate": 4.91503668692052e-05, "loss": 0.2011256456375122, "step": 610 }, { "epoch": 0.11189956331877729, "grad_norm": 1.3976134061813354, "learning_rate": 4.91312354690503e-05, "loss": 0.19916868209838867, "step": 615 }, { "epoch": 0.11280931586608442, "grad_norm": 0.1465059071779251, "learning_rate": 4.91118948744493e-05, "loss": 0.19487457275390624, "step": 620 }, { "epoch": 0.11371906841339156, "grad_norm": 0.12103168666362762, "learning_rate": 4.909234525306645e-05, "loss": 0.1907251238822937, "step": 625 }, { "epoch": 0.1146288209606987, "grad_norm": 0.12660574913024902, "learning_rate": 4.907258677437802e-05, "loss": 0.19327253103256226, "step": 630 }, { "epoch": 0.11553857350800582, "grad_norm": 0.1347813606262207, "learning_rate": 4.90526196096709e-05, "loss": 0.19637736082077026, "step": 635 }, { "epoch": 0.11644832605531295, "grad_norm": 0.14953652024269104, "learning_rate": 4.903244393204107e-05, "loss": 0.20325069427490233, "step": 640 }, { "epoch": 0.11735807860262008, "grad_norm": 0.13936272263526917, "learning_rate": 4.901205991639213e-05, "loss": 0.1930275321006775, "step": 645 }, { "epoch": 0.11826783114992721, "grad_norm": 0.1448420137166977, "learning_rate": 4.899146773943374e-05, "loss": 0.20026936531066894, "step": 650 }, { "epoch": 0.11917758369723436, "grad_norm": 0.1312534064054489, "learning_rate": 4.897066757968014e-05, "loss": 0.19062033891677857, "step": 655 }, { "epoch": 0.12008733624454149, "grad_norm": 0.13644742965698242, "learning_rate": 4.894965961744859e-05, "loss": 0.18719595670700073, "step": 660 }, { "epoch": 0.12099708879184862, "grad_norm": 0.14276087284088135, "learning_rate": 4.892844403485777e-05, "loss": 0.19784307479858398, "step": 665 }, { "epoch": 0.12190684133915575, "grad_norm": 0.14735399186611176, "learning_rate": 4.890702101582623e-05, "loss": 0.19163782596588136, "step": 670 }, { "epoch": 0.12281659388646288, "grad_norm": 0.15742065012454987, "learning_rate": 4.888539074607082e-05, "loss": 0.19312986135482788, "step": 675 }, { "epoch": 0.12372634643377002, "grad_norm": 0.12917031347751617, "learning_rate": 4.8863553413105025e-05, "loss": 0.20066320896148682, "step": 680 }, { "epoch": 0.12463609898107715, "grad_norm": 0.1484801322221756, "learning_rate": 4.884150920623737e-05, "loss": 0.20096096992492676, "step": 685 }, { "epoch": 0.12554585152838427, "grad_norm": 0.1455296128988266, "learning_rate": 4.88192583165698e-05, "loss": 0.20518505573272705, "step": 690 }, { "epoch": 0.12645560407569142, "grad_norm": 0.14517490565776825, "learning_rate": 4.879680093699598e-05, "loss": 0.18859238624572755, "step": 695 }, { "epoch": 0.12736535662299855, "grad_norm": 0.18778090178966522, "learning_rate": 4.877413726219964e-05, "loss": 0.197074818611145, "step": 700 }, { "epoch": 0.12827510917030568, "grad_norm": 0.13497677445411682, "learning_rate": 4.87512674886529e-05, "loss": 0.18713107109069824, "step": 705 }, { "epoch": 0.12918486171761281, "grad_norm": 0.12657155096530914, "learning_rate": 4.872819181461455e-05, "loss": 0.1858484387397766, "step": 710 }, { "epoch": 0.13009461426491994, "grad_norm": 0.11458148807287216, "learning_rate": 4.870491044012834e-05, "loss": 0.18732179403305055, "step": 715 }, { "epoch": 0.13100436681222707, "grad_norm": 0.13000249862670898, "learning_rate": 4.8681423567021244e-05, "loss": 0.1872936010360718, "step": 720 }, { "epoch": 0.1319141193595342, "grad_norm": 0.14580890536308289, "learning_rate": 4.865773139890172e-05, "loss": 0.19280019998550416, "step": 725 }, { "epoch": 0.13282387190684133, "grad_norm": 0.1507277935743332, "learning_rate": 4.8633834141157913e-05, "loss": 0.1898929238319397, "step": 730 }, { "epoch": 0.13373362445414846, "grad_norm": 0.1418737769126892, "learning_rate": 4.860973200095592e-05, "loss": 0.17926375865936278, "step": 735 }, { "epoch": 0.1346433770014556, "grad_norm": 0.17151866853237152, "learning_rate": 4.858542518723794e-05, "loss": 0.18963592052459716, "step": 740 }, { "epoch": 0.13555312954876272, "grad_norm": 0.11162743717432022, "learning_rate": 4.8560913910720535e-05, "loss": 0.19466646909713745, "step": 745 }, { "epoch": 0.13646288209606988, "grad_norm": 0.15628376603126526, "learning_rate": 4.8536198383892725e-05, "loss": 0.19494034051895143, "step": 750 }, { "epoch": 0.137372634643377, "grad_norm": 0.18209289014339447, "learning_rate": 4.851127882101421e-05, "loss": 0.18747550249099731, "step": 755 }, { "epoch": 0.13828238719068414, "grad_norm": 0.14559614658355713, "learning_rate": 4.8486155438113454e-05, "loss": 0.1897158980369568, "step": 760 }, { "epoch": 0.13919213973799127, "grad_norm": 0.3198587894439697, "learning_rate": 4.846082845298586e-05, "loss": 0.18571001291275024, "step": 765 }, { "epoch": 0.1401018922852984, "grad_norm": 0.1486678421497345, "learning_rate": 4.843529808519189e-05, "loss": 0.19561930894851684, "step": 770 }, { "epoch": 0.14101164483260553, "grad_norm": 0.15318170189857483, "learning_rate": 4.840956455605509e-05, "loss": 0.187040114402771, "step": 775 }, { "epoch": 0.14192139737991266, "grad_norm": 0.13754244148731232, "learning_rate": 4.838362808866025e-05, "loss": 0.18345539569854735, "step": 780 }, { "epoch": 0.1428311499272198, "grad_norm": 0.12943248450756073, "learning_rate": 4.835748890785143e-05, "loss": 0.1921079397201538, "step": 785 }, { "epoch": 0.14374090247452692, "grad_norm": 0.110458143055439, "learning_rate": 4.833114724023001e-05, "loss": 0.17927205562591553, "step": 790 }, { "epoch": 0.14465065502183405, "grad_norm": 0.2421770840883255, "learning_rate": 4.830460331415275e-05, "loss": 0.18317567110061644, "step": 795 }, { "epoch": 0.14556040756914118, "grad_norm": 0.14752762019634247, "learning_rate": 4.8277857359729787e-05, "loss": 0.1843916058540344, "step": 800 }, { "epoch": 0.14647016011644834, "grad_norm": 0.15043556690216064, "learning_rate": 4.8250909608822644e-05, "loss": 0.18354393243789674, "step": 805 }, { "epoch": 0.14737991266375547, "grad_norm": 0.1381794661283493, "learning_rate": 4.822376029504223e-05, "loss": 0.1789781332015991, "step": 810 }, { "epoch": 0.1482896652110626, "grad_norm": 0.18386174738407135, "learning_rate": 4.819640965374681e-05, "loss": 0.19494292736053467, "step": 815 }, { "epoch": 0.14919941775836973, "grad_norm": 0.13829593360424042, "learning_rate": 4.816885792203996e-05, "loss": 0.18486063480377196, "step": 820 }, { "epoch": 0.15010917030567686, "grad_norm": 0.15033291280269623, "learning_rate": 4.814110533876852e-05, "loss": 0.18061509132385253, "step": 825 }, { "epoch": 0.151018922852984, "grad_norm": 0.17150473594665527, "learning_rate": 4.811315214452051e-05, "loss": 0.18464866876602173, "step": 830 }, { "epoch": 0.15192867540029112, "grad_norm": 0.15317125618457794, "learning_rate": 4.808499858162307e-05, "loss": 0.1837708592414856, "step": 835 }, { "epoch": 0.15283842794759825, "grad_norm": 0.2671392560005188, "learning_rate": 4.805664489414031e-05, "loss": 0.19338636398315429, "step": 840 }, { "epoch": 0.15374818049490538, "grad_norm": 0.14047028124332428, "learning_rate": 4.802809132787125e-05, "loss": 0.17069108486175538, "step": 845 }, { "epoch": 0.1546579330422125, "grad_norm": 0.1520431935787201, "learning_rate": 4.799933813034768e-05, "loss": 0.18607735633850098, "step": 850 }, { "epoch": 0.15556768558951964, "grad_norm": 0.17239463329315186, "learning_rate": 4.797038555083197e-05, "loss": 0.18069062232971192, "step": 855 }, { "epoch": 0.1564774381368268, "grad_norm": 0.1377955675125122, "learning_rate": 4.794123384031495e-05, "loss": 0.18870222568511963, "step": 860 }, { "epoch": 0.15738719068413393, "grad_norm": 0.15901461243629456, "learning_rate": 4.791188325151373e-05, "loss": 0.18128334283828734, "step": 865 }, { "epoch": 0.15829694323144106, "grad_norm": 0.14634132385253906, "learning_rate": 4.7882334038869495e-05, "loss": 0.1866163969039917, "step": 870 }, { "epoch": 0.1592066957787482, "grad_norm": 0.15361061692237854, "learning_rate": 4.785258645854529e-05, "loss": 0.17850807905197144, "step": 875 }, { "epoch": 0.16011644832605532, "grad_norm": 0.13751649856567383, "learning_rate": 4.782264076842385e-05, "loss": 0.17731113433837892, "step": 880 }, { "epoch": 0.16102620087336245, "grad_norm": 0.17909638583660126, "learning_rate": 4.7792497228105314e-05, "loss": 0.18344542980194092, "step": 885 }, { "epoch": 0.16193595342066958, "grad_norm": 0.16038304567337036, "learning_rate": 4.776215609890498e-05, "loss": 0.18868647813796996, "step": 890 }, { "epoch": 0.1628457059679767, "grad_norm": 0.1653951108455658, "learning_rate": 4.773161764385107e-05, "loss": 0.18614152669906617, "step": 895 }, { "epoch": 0.16375545851528384, "grad_norm": 0.16193026304244995, "learning_rate": 4.770088212768241e-05, "loss": 0.18564575910568237, "step": 900 }, { "epoch": 0.16466521106259097, "grad_norm": 0.16048531234264374, "learning_rate": 4.7669949816846173e-05, "loss": 0.18330031633377075, "step": 905 }, { "epoch": 0.1655749636098981, "grad_norm": 0.1440177708864212, "learning_rate": 4.7638820979495534e-05, "loss": 0.17712442874908446, "step": 910 }, { "epoch": 0.16648471615720525, "grad_norm": 0.19635969400405884, "learning_rate": 4.760749588548738e-05, "loss": 0.18679027557373046, "step": 915 }, { "epoch": 0.16739446870451238, "grad_norm": 0.15576541423797607, "learning_rate": 4.757597480637995e-05, "loss": 0.19283764362335204, "step": 920 }, { "epoch": 0.1683042212518195, "grad_norm": 0.1550331562757492, "learning_rate": 4.7544258015430463e-05, "loss": 0.18269542455673218, "step": 925 }, { "epoch": 0.16921397379912664, "grad_norm": 0.18369626998901367, "learning_rate": 4.75123457875928e-05, "loss": 0.1697891116142273, "step": 930 }, { "epoch": 0.17012372634643377, "grad_norm": 0.15266314148902893, "learning_rate": 4.7480238399515074e-05, "loss": 0.18523451089859008, "step": 935 }, { "epoch": 0.1710334788937409, "grad_norm": 0.16709664463996887, "learning_rate": 4.744793612953724e-05, "loss": 0.1803238034248352, "step": 940 }, { "epoch": 0.17194323144104803, "grad_norm": 0.14929179847240448, "learning_rate": 4.741543925768872e-05, "loss": 0.1861217737197876, "step": 945 }, { "epoch": 0.17285298398835516, "grad_norm": 0.1362280696630478, "learning_rate": 4.7382748065685915e-05, "loss": 0.17896100282669067, "step": 950 }, { "epoch": 0.1737627365356623, "grad_norm": 0.15290239453315735, "learning_rate": 4.734986283692982e-05, "loss": 0.18432788848876952, "step": 955 }, { "epoch": 0.17467248908296942, "grad_norm": 0.1287035197019577, "learning_rate": 4.73167838565035e-05, "loss": 0.18485682010650634, "step": 960 }, { "epoch": 0.17558224163027655, "grad_norm": 0.17969627678394318, "learning_rate": 4.728351141116971e-05, "loss": 0.17361557483673096, "step": 965 }, { "epoch": 0.1764919941775837, "grad_norm": 0.13751201331615448, "learning_rate": 4.7250045789368326e-05, "loss": 0.1731679320335388, "step": 970 }, { "epoch": 0.17740174672489084, "grad_norm": 0.1603265255689621, "learning_rate": 4.721638728121388e-05, "loss": 0.17308170795440675, "step": 975 }, { "epoch": 0.17831149927219797, "grad_norm": 0.1592789888381958, "learning_rate": 4.718253617849306e-05, "loss": 0.17534757852554322, "step": 980 }, { "epoch": 0.1792212518195051, "grad_norm": 0.12727224826812744, "learning_rate": 4.714849277466214e-05, "loss": 0.17817609310150145, "step": 985 }, { "epoch": 0.18013100436681223, "grad_norm": 0.15401554107666016, "learning_rate": 4.711425736484447e-05, "loss": 0.1733405351638794, "step": 990 }, { "epoch": 0.18104075691411936, "grad_norm": 0.13253968954086304, "learning_rate": 4.7079830245827906e-05, "loss": 0.17846795320510864, "step": 995 }, { "epoch": 0.1819505094614265, "grad_norm": 0.21846213936805725, "learning_rate": 4.7045211716062245e-05, "loss": 0.18021599054336548, "step": 1000 }, { "epoch": 0.18286026200873362, "grad_norm": 0.16867990791797638, "learning_rate": 4.7010402075656595e-05, "loss": 0.18232386112213134, "step": 1005 }, { "epoch": 0.18377001455604075, "grad_norm": 0.17180582880973816, "learning_rate": 4.697540162637686e-05, "loss": 0.1816317319869995, "step": 1010 }, { "epoch": 0.18467976710334788, "grad_norm": 0.16480213403701782, "learning_rate": 4.694021067164303e-05, "loss": 0.17718446254730225, "step": 1015 }, { "epoch": 0.185589519650655, "grad_norm": 0.15015918016433716, "learning_rate": 4.6904829516526605e-05, "loss": 0.17412011623382567, "step": 1020 }, { "epoch": 0.18649927219796217, "grad_norm": 0.14445139467716217, "learning_rate": 4.686925846774795e-05, "loss": 0.1778018832206726, "step": 1025 }, { "epoch": 0.1874090247452693, "grad_norm": 0.1701960265636444, "learning_rate": 4.683349783367362e-05, "loss": 0.16901081800460815, "step": 1030 }, { "epoch": 0.18831877729257643, "grad_norm": 0.15894867479801178, "learning_rate": 4.679754792431368e-05, "loss": 0.17055928707122803, "step": 1035 }, { "epoch": 0.18922852983988356, "grad_norm": 0.1511942446231842, "learning_rate": 4.676140905131903e-05, "loss": 0.17339680194854737, "step": 1040 }, { "epoch": 0.1901382823871907, "grad_norm": 0.14735209941864014, "learning_rate": 4.672508152797872e-05, "loss": 0.17802717685699462, "step": 1045 }, { "epoch": 0.19104803493449782, "grad_norm": 0.17367291450500488, "learning_rate": 4.66885656692172e-05, "loss": 0.1732744097709656, "step": 1050 }, { "epoch": 0.19195778748180495, "grad_norm": 0.147227481007576, "learning_rate": 4.665186179159159e-05, "loss": 0.17040517330169677, "step": 1055 }, { "epoch": 0.19286754002911208, "grad_norm": 0.1709655076265335, "learning_rate": 4.6614970213289e-05, "loss": 0.17794088125228882, "step": 1060 }, { "epoch": 0.1937772925764192, "grad_norm": 0.1588088721036911, "learning_rate": 4.657789125412366e-05, "loss": 0.17180380821228028, "step": 1065 }, { "epoch": 0.19468704512372634, "grad_norm": 0.14827021956443787, "learning_rate": 4.654062523553428e-05, "loss": 0.182997989654541, "step": 1070 }, { "epoch": 0.19559679767103347, "grad_norm": 0.16230466961860657, "learning_rate": 4.6503172480581126e-05, "loss": 0.17346880435943604, "step": 1075 }, { "epoch": 0.1965065502183406, "grad_norm": 0.1637624353170395, "learning_rate": 4.646553331394333e-05, "loss": 0.17263576984405518, "step": 1080 }, { "epoch": 0.19741630276564776, "grad_norm": 0.15977843105793, "learning_rate": 4.642770806191603e-05, "loss": 0.17284308671951293, "step": 1085 }, { "epoch": 0.19832605531295489, "grad_norm": 0.15394869446754456, "learning_rate": 4.6389697052407534e-05, "loss": 0.17797101736068727, "step": 1090 }, { "epoch": 0.19923580786026202, "grad_norm": 0.15995225310325623, "learning_rate": 4.6351500614936485e-05, "loss": 0.18137198686599731, "step": 1095 }, { "epoch": 0.20014556040756915, "grad_norm": 0.1779479682445526, "learning_rate": 4.6313119080629006e-05, "loss": 0.17998344898223878, "step": 1100 }, { "epoch": 0.20105531295487628, "grad_norm": 0.14362832903862, "learning_rate": 4.627455278221584e-05, "loss": 0.18196423053741456, "step": 1105 }, { "epoch": 0.2019650655021834, "grad_norm": 0.15951639413833618, "learning_rate": 4.623580205402947e-05, "loss": 0.17423888444900512, "step": 1110 }, { "epoch": 0.20287481804949054, "grad_norm": 0.17273563146591187, "learning_rate": 4.619686723200115e-05, "loss": 0.17392473220825194, "step": 1115 }, { "epoch": 0.20378457059679767, "grad_norm": 0.1655360758304596, "learning_rate": 4.615774865365813e-05, "loss": 0.17528389692306517, "step": 1120 }, { "epoch": 0.2046943231441048, "grad_norm": 0.15920691192150116, "learning_rate": 4.611844665812058e-05, "loss": 0.1806849241256714, "step": 1125 }, { "epoch": 0.20560407569141192, "grad_norm": 0.16114577651023865, "learning_rate": 4.607896158609875e-05, "loss": 0.17217352390289306, "step": 1130 }, { "epoch": 0.20651382823871905, "grad_norm": 0.1499422937631607, "learning_rate": 4.603929377988999e-05, "loss": 0.17806737422943114, "step": 1135 }, { "epoch": 0.2074235807860262, "grad_norm": 0.17605191469192505, "learning_rate": 4.5999443583375765e-05, "loss": 0.17842113971710205, "step": 1140 }, { "epoch": 0.20833333333333334, "grad_norm": 0.16117210686206818, "learning_rate": 4.595941134201871e-05, "loss": 0.18379683494567872, "step": 1145 }, { "epoch": 0.20924308588064047, "grad_norm": 0.21199050545692444, "learning_rate": 4.591919740285957e-05, "loss": 0.16286123991012574, "step": 1150 }, { "epoch": 0.2101528384279476, "grad_norm": 0.15100529789924622, "learning_rate": 4.587880211451427e-05, "loss": 0.17995200157165528, "step": 1155 }, { "epoch": 0.21106259097525473, "grad_norm": 0.16618172824382782, "learning_rate": 4.583822582717085e-05, "loss": 0.16960303783416747, "step": 1160 }, { "epoch": 0.21197234352256186, "grad_norm": 0.14743569493293762, "learning_rate": 4.579746889258643e-05, "loss": 0.17762668132781984, "step": 1165 }, { "epoch": 0.212882096069869, "grad_norm": 0.1697179079055786, "learning_rate": 4.575653166408417e-05, "loss": 0.16656005382537842, "step": 1170 }, { "epoch": 0.21379184861717612, "grad_norm": 0.14886513352394104, "learning_rate": 4.57154144965502e-05, "loss": 0.17091882228851318, "step": 1175 }, { "epoch": 0.21470160116448325, "grad_norm": 0.18197473883628845, "learning_rate": 4.5674117746430556e-05, "loss": 0.1770920753479004, "step": 1180 }, { "epoch": 0.21561135371179038, "grad_norm": 0.17323088645935059, "learning_rate": 4.563264177172807e-05, "loss": 0.1734643578529358, "step": 1185 }, { "epoch": 0.2165211062590975, "grad_norm": 0.1521984338760376, "learning_rate": 4.559098693199929e-05, "loss": 0.17515116930007935, "step": 1190 }, { "epoch": 0.21743085880640467, "grad_norm": 0.1842304915189743, "learning_rate": 4.554915358835134e-05, "loss": 0.16798022985458375, "step": 1195 }, { "epoch": 0.2183406113537118, "grad_norm": 0.14753451943397522, "learning_rate": 4.5507142103438794e-05, "loss": 0.1755476713180542, "step": 1200 }, { "epoch": 0.21925036390101893, "grad_norm": 0.17096194624900818, "learning_rate": 4.546495284146057e-05, "loss": 0.1792473554611206, "step": 1205 }, { "epoch": 0.22016011644832606, "grad_norm": 0.1579233556985855, "learning_rate": 4.542258616815669e-05, "loss": 0.17230144739151002, "step": 1210 }, { "epoch": 0.2210698689956332, "grad_norm": 0.177297905087471, "learning_rate": 4.5380042450805216e-05, "loss": 0.1807127833366394, "step": 1215 }, { "epoch": 0.22197962154294032, "grad_norm": 0.14331696927547455, "learning_rate": 4.533732205821897e-05, "loss": 0.17201389074325563, "step": 1220 }, { "epoch": 0.22288937409024745, "grad_norm": 0.14473360776901245, "learning_rate": 4.529442536074239e-05, "loss": 0.17036900520324708, "step": 1225 }, { "epoch": 0.22379912663755458, "grad_norm": 0.1820901483297348, "learning_rate": 4.5251352730248314e-05, "loss": 0.17704882621765136, "step": 1230 }, { "epoch": 0.2247088791848617, "grad_norm": 0.1948976367712021, "learning_rate": 4.5208104540134746e-05, "loss": 0.1706973433494568, "step": 1235 }, { "epoch": 0.22561863173216884, "grad_norm": 0.16660070419311523, "learning_rate": 4.51646811653216e-05, "loss": 0.17636821269989014, "step": 1240 }, { "epoch": 0.22652838427947597, "grad_norm": 0.1699984073638916, "learning_rate": 4.512108298224751e-05, "loss": 0.16986632347106934, "step": 1245 }, { "epoch": 0.22743813682678313, "grad_norm": 0.17601042985916138, "learning_rate": 4.50773103688665e-05, "loss": 0.17507898807525635, "step": 1250 }, { "epoch": 0.22834788937409026, "grad_norm": 0.17557238042354584, "learning_rate": 4.503336370464476e-05, "loss": 0.17702863216400147, "step": 1255 }, { "epoch": 0.2292576419213974, "grad_norm": 0.1800651252269745, "learning_rate": 4.498924337055729e-05, "loss": 0.16419180631637573, "step": 1260 }, { "epoch": 0.23016739446870452, "grad_norm": 0.2022479772567749, "learning_rate": 4.494494974908468e-05, "loss": 0.17482060194015503, "step": 1265 }, { "epoch": 0.23107714701601165, "grad_norm": 0.14180205762386322, "learning_rate": 4.490048322420973e-05, "loss": 0.1723136067390442, "step": 1270 }, { "epoch": 0.23198689956331878, "grad_norm": 0.18607310950756073, "learning_rate": 4.485584418141419e-05, "loss": 0.17096419334411622, "step": 1275 }, { "epoch": 0.2328966521106259, "grad_norm": 0.15958310663700104, "learning_rate": 4.481103300767529e-05, "loss": 0.1656244158744812, "step": 1280 }, { "epoch": 0.23380640465793304, "grad_norm": 0.17552383244037628, "learning_rate": 4.476605009146255e-05, "loss": 0.17677626609802247, "step": 1285 }, { "epoch": 0.23471615720524017, "grad_norm": 0.15299823880195618, "learning_rate": 4.472089582273429e-05, "loss": 0.1778991103172302, "step": 1290 }, { "epoch": 0.2356259097525473, "grad_norm": 0.14613987505435944, "learning_rate": 4.46755705929343e-05, "loss": 0.17071452140808105, "step": 1295 }, { "epoch": 0.23653566229985443, "grad_norm": 0.17781122028827667, "learning_rate": 4.463007479498843e-05, "loss": 0.16955430507659913, "step": 1300 }, { "epoch": 0.23744541484716158, "grad_norm": 0.16326487064361572, "learning_rate": 4.458440882330119e-05, "loss": 0.1777693510055542, "step": 1305 }, { "epoch": 0.23835516739446871, "grad_norm": 0.17701926827430725, "learning_rate": 4.4538573073752365e-05, "loss": 0.16323351860046387, "step": 1310 }, { "epoch": 0.23926491994177584, "grad_norm": 0.13104717433452606, "learning_rate": 4.449256794369349e-05, "loss": 0.17653456926345826, "step": 1315 }, { "epoch": 0.24017467248908297, "grad_norm": 0.1796836256980896, "learning_rate": 4.444639383194452e-05, "loss": 0.17189600467681884, "step": 1320 }, { "epoch": 0.2410844250363901, "grad_norm": 0.14919696748256683, "learning_rate": 4.440005113879029e-05, "loss": 0.17003334760665895, "step": 1325 }, { "epoch": 0.24199417758369723, "grad_norm": 0.1728784441947937, "learning_rate": 4.4353540265977064e-05, "loss": 0.17397408485412597, "step": 1330 }, { "epoch": 0.24290393013100436, "grad_norm": 0.14591015875339508, "learning_rate": 4.43068616167091e-05, "loss": 0.16498478651046752, "step": 1335 }, { "epoch": 0.2438136826783115, "grad_norm": 0.18417201936244965, "learning_rate": 4.4260015595645055e-05, "loss": 0.16841750144958495, "step": 1340 }, { "epoch": 0.24472343522561862, "grad_norm": 0.16264279186725616, "learning_rate": 4.4213002608894605e-05, "loss": 0.16907373666763306, "step": 1345 }, { "epoch": 0.24563318777292575, "grad_norm": 0.15248481929302216, "learning_rate": 4.416582306401481e-05, "loss": 0.15931472778320313, "step": 1350 }, { "epoch": 0.24654294032023288, "grad_norm": 0.1488373875617981, "learning_rate": 4.4118477370006636e-05, "loss": 0.1701716423034668, "step": 1355 }, { "epoch": 0.24745269286754004, "grad_norm": 0.14679782092571259, "learning_rate": 4.407096593731142e-05, "loss": 0.157412326335907, "step": 1360 }, { "epoch": 0.24836244541484717, "grad_norm": 0.17139530181884766, "learning_rate": 4.402328917780728e-05, "loss": 0.17303754091262818, "step": 1365 }, { "epoch": 0.2492721979621543, "grad_norm": 0.1534871757030487, "learning_rate": 4.397544750480554e-05, "loss": 0.1786255121231079, "step": 1370 }, { "epoch": 0.2501819505094614, "grad_norm": 0.1876252293586731, "learning_rate": 4.39274413330472e-05, "loss": 0.16442898511886597, "step": 1375 }, { "epoch": 0.25109170305676853, "grad_norm": 0.16165752708911896, "learning_rate": 4.387927107869928e-05, "loss": 0.1780426025390625, "step": 1380 }, { "epoch": 0.25200145560407566, "grad_norm": 0.17242255806922913, "learning_rate": 4.383093715935124e-05, "loss": 0.15959256887435913, "step": 1385 }, { "epoch": 0.25291120815138285, "grad_norm": 0.1627114862203598, "learning_rate": 4.378243999401137e-05, "loss": 0.17606115341186523, "step": 1390 }, { "epoch": 0.25382096069869, "grad_norm": 0.15911224484443665, "learning_rate": 4.373378000310312e-05, "loss": 0.16798585653305054, "step": 1395 }, { "epoch": 0.2547307132459971, "grad_norm": 0.15542249381542206, "learning_rate": 4.3684957608461505e-05, "loss": 0.1695417881011963, "step": 1400 }, { "epoch": 0.25564046579330424, "grad_norm": 0.1475304812192917, "learning_rate": 4.363597323332941e-05, "loss": 0.16340878009796142, "step": 1405 }, { "epoch": 0.25655021834061137, "grad_norm": 0.16943927109241486, "learning_rate": 4.358682730235395e-05, "loss": 0.17240238189697266, "step": 1410 }, { "epoch": 0.2574599708879185, "grad_norm": 0.1816391944885254, "learning_rate": 4.3537520241582744e-05, "loss": 0.16558437347412108, "step": 1415 }, { "epoch": 0.25836972343522563, "grad_norm": 0.23851341009140015, "learning_rate": 4.348805247846027e-05, "loss": 0.16796000003814698, "step": 1420 }, { "epoch": 0.25927947598253276, "grad_norm": 0.15415243804454803, "learning_rate": 4.343842444182414e-05, "loss": 0.1746017098426819, "step": 1425 }, { "epoch": 0.2601892285298399, "grad_norm": 0.15651032328605652, "learning_rate": 4.338863656190139e-05, "loss": 0.1649057984352112, "step": 1430 }, { "epoch": 0.261098981077147, "grad_norm": 0.16601966321468353, "learning_rate": 4.333868927030471e-05, "loss": 0.15888988971710205, "step": 1435 }, { "epoch": 0.26200873362445415, "grad_norm": 0.1549467295408249, "learning_rate": 4.328858300002876e-05, "loss": 0.16357985734939576, "step": 1440 }, { "epoch": 0.2629184861717613, "grad_norm": 0.16332370042800903, "learning_rate": 4.32383181854464e-05, "loss": 0.16749982833862304, "step": 1445 }, { "epoch": 0.2638282387190684, "grad_norm": 0.14827077090740204, "learning_rate": 4.3187895262304894e-05, "loss": 0.16886214017868043, "step": 1450 }, { "epoch": 0.26473799126637554, "grad_norm": 0.1557198166847229, "learning_rate": 4.313731466772216e-05, "loss": 0.17512214183807373, "step": 1455 }, { "epoch": 0.26564774381368267, "grad_norm": 0.17263570427894592, "learning_rate": 4.308657684018299e-05, "loss": 0.16248074769973755, "step": 1460 }, { "epoch": 0.2665574963609898, "grad_norm": 0.17135761678218842, "learning_rate": 4.303568221953521e-05, "loss": 0.16605921983718872, "step": 1465 }, { "epoch": 0.26746724890829693, "grad_norm": 0.14322632551193237, "learning_rate": 4.2984631246985897e-05, "loss": 0.1610772728919983, "step": 1470 }, { "epoch": 0.26837700145560406, "grad_norm": 0.18852312862873077, "learning_rate": 4.2933424365097564e-05, "loss": 0.1686462163925171, "step": 1475 }, { "epoch": 0.2692867540029112, "grad_norm": 0.1780245155096054, "learning_rate": 4.2882062017784294e-05, "loss": 0.16953932046890258, "step": 1480 }, { "epoch": 0.2701965065502183, "grad_norm": 0.180568665266037, "learning_rate": 4.2830544650307895e-05, "loss": 0.16442664861679077, "step": 1485 }, { "epoch": 0.27110625909752545, "grad_norm": 0.16876435279846191, "learning_rate": 4.277887270927407e-05, "loss": 0.17128173112869263, "step": 1490 }, { "epoch": 0.2720160116448326, "grad_norm": 0.164053276181221, "learning_rate": 4.2727046642628513e-05, "loss": 0.16331382989883422, "step": 1495 }, { "epoch": 0.27292576419213976, "grad_norm": 0.14577528834342957, "learning_rate": 4.267506689965305e-05, "loss": 0.1638316035270691, "step": 1500 }, { "epoch": 0.2738355167394469, "grad_norm": 0.1648740917444229, "learning_rate": 4.262293393096171e-05, "loss": 0.15332664251327516, "step": 1505 }, { "epoch": 0.274745269286754, "grad_norm": 0.16445094347000122, "learning_rate": 4.257064818849685e-05, "loss": 0.1706634521484375, "step": 1510 }, { "epoch": 0.27565502183406115, "grad_norm": 0.1584935486316681, "learning_rate": 4.251821012552524e-05, "loss": 0.1684114694595337, "step": 1515 }, { "epoch": 0.2765647743813683, "grad_norm": 0.17215611040592194, "learning_rate": 4.24656201966341e-05, "loss": 0.15594131946563722, "step": 1520 }, { "epoch": 0.2774745269286754, "grad_norm": 0.15945589542388916, "learning_rate": 4.2412878857727214e-05, "loss": 0.1686659574508667, "step": 1525 }, { "epoch": 0.27838427947598254, "grad_norm": 0.16103951632976532, "learning_rate": 4.2359986566020906e-05, "loss": 0.17779340744018554, "step": 1530 }, { "epoch": 0.2792940320232897, "grad_norm": 0.1770307570695877, "learning_rate": 4.230694378004014e-05, "loss": 0.16786882877349854, "step": 1535 }, { "epoch": 0.2802037845705968, "grad_norm": 0.16225053369998932, "learning_rate": 4.2253750959614504e-05, "loss": 0.16558897495269775, "step": 1540 }, { "epoch": 0.28111353711790393, "grad_norm": 0.27213969826698303, "learning_rate": 4.220040856587425e-05, "loss": 0.1641119599342346, "step": 1545 }, { "epoch": 0.28202328966521106, "grad_norm": 0.1773071587085724, "learning_rate": 4.2146917061246284e-05, "loss": 0.16919140815734862, "step": 1550 }, { "epoch": 0.2829330422125182, "grad_norm": 0.15519705414772034, "learning_rate": 4.209327690945014e-05, "loss": 0.15501506328582765, "step": 1555 }, { "epoch": 0.2838427947598253, "grad_norm": 0.19921597838401794, "learning_rate": 4.203948857549402e-05, "loss": 0.1690821886062622, "step": 1560 }, { "epoch": 0.28475254730713245, "grad_norm": 0.15417630970478058, "learning_rate": 4.1985552525670696e-05, "loss": 0.1675640344619751, "step": 1565 }, { "epoch": 0.2856622998544396, "grad_norm": 0.1739572137594223, "learning_rate": 4.193146922755348e-05, "loss": 0.16738017797470092, "step": 1570 }, { "epoch": 0.2865720524017467, "grad_norm": 0.1384361982345581, "learning_rate": 4.187723914999221e-05, "loss": 0.16802358627319336, "step": 1575 }, { "epoch": 0.28748180494905384, "grad_norm": 0.1491454839706421, "learning_rate": 4.182286276310915e-05, "loss": 0.1619583249092102, "step": 1580 }, { "epoch": 0.288391557496361, "grad_norm": 0.15831919014453888, "learning_rate": 4.176834053829492e-05, "loss": 0.1625199794769287, "step": 1585 }, { "epoch": 0.2893013100436681, "grad_norm": 0.16265396773815155, "learning_rate": 4.1713672948204416e-05, "loss": 0.16718552112579346, "step": 1590 }, { "epoch": 0.29021106259097523, "grad_norm": 0.15153461694717407, "learning_rate": 4.1658860466752714e-05, "loss": 0.15979087352752686, "step": 1595 }, { "epoch": 0.29112081513828236, "grad_norm": 0.1620412915945053, "learning_rate": 4.160390356911096e-05, "loss": 0.16103557348251343, "step": 1600 }, { "epoch": 0.2920305676855895, "grad_norm": 0.16673807799816132, "learning_rate": 4.154880273170223e-05, "loss": 0.16394708156585694, "step": 1605 }, { "epoch": 0.2929403202328967, "grad_norm": 0.14834867417812347, "learning_rate": 4.149355843219744e-05, "loss": 0.15916435718536376, "step": 1610 }, { "epoch": 0.2938500727802038, "grad_norm": 0.16977964341640472, "learning_rate": 4.143817114951119e-05, "loss": 0.16538127660751342, "step": 1615 }, { "epoch": 0.29475982532751094, "grad_norm": 0.17986875772476196, "learning_rate": 4.138264136379756e-05, "loss": 0.15514618158340454, "step": 1620 }, { "epoch": 0.29566957787481807, "grad_norm": 0.15794920921325684, "learning_rate": 4.132696955644605e-05, "loss": 0.15992183685302735, "step": 1625 }, { "epoch": 0.2965793304221252, "grad_norm": 0.19955399632453918, "learning_rate": 4.127115621007731e-05, "loss": 0.16362056732177735, "step": 1630 }, { "epoch": 0.29748908296943233, "grad_norm": 0.1352023035287857, "learning_rate": 4.121520180853903e-05, "loss": 0.15631601810455323, "step": 1635 }, { "epoch": 0.29839883551673946, "grad_norm": 0.15340781211853027, "learning_rate": 4.1159106836901674e-05, "loss": 0.1571858048439026, "step": 1640 }, { "epoch": 0.2993085880640466, "grad_norm": 0.15311770141124725, "learning_rate": 4.110287178145433e-05, "loss": 0.16082344055175782, "step": 1645 }, { "epoch": 0.3002183406113537, "grad_norm": 0.17811627686023712, "learning_rate": 4.10464971297005e-05, "loss": 0.16117215156555176, "step": 1650 }, { "epoch": 0.30112809315866085, "grad_norm": 0.21060039103031158, "learning_rate": 4.0989983370353805e-05, "loss": 0.15838587284088135, "step": 1655 }, { "epoch": 0.302037845705968, "grad_norm": 0.155836820602417, "learning_rate": 4.093333099333383e-05, "loss": 0.16648870706558228, "step": 1660 }, { "epoch": 0.3029475982532751, "grad_norm": 0.13711698353290558, "learning_rate": 4.0876540489761826e-05, "loss": 0.16899349689483642, "step": 1665 }, { "epoch": 0.30385735080058224, "grad_norm": 0.15162716805934906, "learning_rate": 4.0819612351956485e-05, "loss": 0.16574090719223022, "step": 1670 }, { "epoch": 0.30476710334788937, "grad_norm": 0.15016348659992218, "learning_rate": 4.0762547073429615e-05, "loss": 0.1689780354499817, "step": 1675 }, { "epoch": 0.3056768558951965, "grad_norm": 0.15182986855506897, "learning_rate": 4.070534514888194e-05, "loss": 0.1593686819076538, "step": 1680 }, { "epoch": 0.3065866084425036, "grad_norm": 0.15648750960826874, "learning_rate": 4.0648007074198765e-05, "loss": 0.16436235904693602, "step": 1685 }, { "epoch": 0.30749636098981076, "grad_norm": 0.18339484930038452, "learning_rate": 4.0590533346445665e-05, "loss": 0.1678077220916748, "step": 1690 }, { "epoch": 0.3084061135371179, "grad_norm": 0.16426527500152588, "learning_rate": 4.053292446386422e-05, "loss": 0.1689227342605591, "step": 1695 }, { "epoch": 0.309315866084425, "grad_norm": 0.16129335761070251, "learning_rate": 4.047518092586766e-05, "loss": 0.16592445373535156, "step": 1700 }, { "epoch": 0.31022561863173215, "grad_norm": 0.15512363612651825, "learning_rate": 4.041730323303654e-05, "loss": 0.16142364740371704, "step": 1705 }, { "epoch": 0.3111353711790393, "grad_norm": 0.159842386841774, "learning_rate": 4.0359291887114425e-05, "loss": 0.1702875852584839, "step": 1710 }, { "epoch": 0.3120451237263464, "grad_norm": 0.19558854401111603, "learning_rate": 4.030114739100352e-05, "loss": 0.15966148376464845, "step": 1715 }, { "epoch": 0.3129548762736536, "grad_norm": 0.1577496975660324, "learning_rate": 4.024287024876029e-05, "loss": 0.1620358943939209, "step": 1720 }, { "epoch": 0.3138646288209607, "grad_norm": 0.1629355251789093, "learning_rate": 4.0184460965591144e-05, "loss": 0.16511552333831786, "step": 1725 }, { "epoch": 0.31477438136826785, "grad_norm": 0.17060767114162445, "learning_rate": 4.0125920047848e-05, "loss": 0.15672838687896729, "step": 1730 }, { "epoch": 0.315684133915575, "grad_norm": 0.22447620332241058, "learning_rate": 4.006724800302394e-05, "loss": 0.15339784622192382, "step": 1735 }, { "epoch": 0.3165938864628821, "grad_norm": 0.14572037756443024, "learning_rate": 4.000844533974878e-05, "loss": 0.16566959619522095, "step": 1740 }, { "epoch": 0.31750363901018924, "grad_norm": 0.15915483236312866, "learning_rate": 3.9949512567784684e-05, "loss": 0.16153957843780517, "step": 1745 }, { "epoch": 0.3184133915574964, "grad_norm": 0.1668540984392166, "learning_rate": 3.9890450198021704e-05, "loss": 0.1659809947013855, "step": 1750 }, { "epoch": 0.3193231441048035, "grad_norm": 0.16612035036087036, "learning_rate": 3.983125874247341e-05, "loss": 0.16941241025924683, "step": 1755 }, { "epoch": 0.32023289665211063, "grad_norm": 0.15163679420948029, "learning_rate": 3.9771938714272407e-05, "loss": 0.16053590774536133, "step": 1760 }, { "epoch": 0.32114264919941776, "grad_norm": 0.1797824203968048, "learning_rate": 3.97124906276659e-05, "loss": 0.1667110800743103, "step": 1765 }, { "epoch": 0.3220524017467249, "grad_norm": 0.15076608955860138, "learning_rate": 3.9652914998011237e-05, "loss": 0.1607860803604126, "step": 1770 }, { "epoch": 0.322962154294032, "grad_norm": 0.16523587703704834, "learning_rate": 3.959321234177144e-05, "loss": 0.16515827178955078, "step": 1775 }, { "epoch": 0.32387190684133915, "grad_norm": 0.22065149247646332, "learning_rate": 3.9533383176510746e-05, "loss": 0.1618957757949829, "step": 1780 }, { "epoch": 0.3247816593886463, "grad_norm": 0.16426463425159454, "learning_rate": 3.9473428020890066e-05, "loss": 0.15763382911682128, "step": 1785 }, { "epoch": 0.3256914119359534, "grad_norm": 0.16474904119968414, "learning_rate": 3.941334739466257e-05, "loss": 0.15135571956634522, "step": 1790 }, { "epoch": 0.32660116448326054, "grad_norm": 0.16746412217617035, "learning_rate": 3.935314181866909e-05, "loss": 0.15925389528274536, "step": 1795 }, { "epoch": 0.32751091703056767, "grad_norm": 0.17819371819496155, "learning_rate": 3.929281181483369e-05, "loss": 0.1598669171333313, "step": 1800 }, { "epoch": 0.3284206695778748, "grad_norm": 0.1816040277481079, "learning_rate": 3.923235790615907e-05, "loss": 0.1652522087097168, "step": 1805 }, { "epoch": 0.32933042212518193, "grad_norm": 0.14846695959568024, "learning_rate": 3.917178061672211e-05, "loss": 0.16665585041046144, "step": 1810 }, { "epoch": 0.33024017467248906, "grad_norm": 0.1734926551580429, "learning_rate": 3.911108047166924e-05, "loss": 0.16069791316986085, "step": 1815 }, { "epoch": 0.3311499272197962, "grad_norm": 0.16154922544956207, "learning_rate": 3.905025799721194e-05, "loss": 0.16114097833633423, "step": 1820 }, { "epoch": 0.3320596797671033, "grad_norm": 0.1538771390914917, "learning_rate": 3.898931372062217e-05, "loss": 0.1602831244468689, "step": 1825 }, { "epoch": 0.3329694323144105, "grad_norm": 0.14036566019058228, "learning_rate": 3.892824817022781e-05, "loss": 0.1502395749092102, "step": 1830 }, { "epoch": 0.33387918486171764, "grad_norm": 0.19212059676647186, "learning_rate": 3.886706187540804e-05, "loss": 0.16265250444412233, "step": 1835 }, { "epoch": 0.33478893740902477, "grad_norm": 0.17410333454608917, "learning_rate": 3.880575536658881e-05, "loss": 0.15689224004745483, "step": 1840 }, { "epoch": 0.3356986899563319, "grad_norm": 0.15165294706821442, "learning_rate": 3.874432917523817e-05, "loss": 0.15033140182495117, "step": 1845 }, { "epoch": 0.336608442503639, "grad_norm": 0.16166730225086212, "learning_rate": 3.8682783833861736e-05, "loss": 0.16896235942840576, "step": 1850 }, { "epoch": 0.33751819505094616, "grad_norm": 0.16497021913528442, "learning_rate": 3.8621119875998026e-05, "loss": 0.1600774645805359, "step": 1855 }, { "epoch": 0.3384279475982533, "grad_norm": 0.17264948785305023, "learning_rate": 3.855933783621384e-05, "loss": 0.16947593688964843, "step": 1860 }, { "epoch": 0.3393377001455604, "grad_norm": 0.16870704293251038, "learning_rate": 3.8497438250099636e-05, "loss": 0.16062095165252685, "step": 1865 }, { "epoch": 0.34024745269286755, "grad_norm": 0.16644036769866943, "learning_rate": 3.843542165426492e-05, "loss": 0.16015599966049193, "step": 1870 }, { "epoch": 0.3411572052401747, "grad_norm": 0.1626352220773697, "learning_rate": 3.837328858633349e-05, "loss": 0.17444703578948975, "step": 1875 }, { "epoch": 0.3420669577874818, "grad_norm": 0.1427375227212906, "learning_rate": 3.83110395849389e-05, "loss": 0.1589805006980896, "step": 1880 }, { "epoch": 0.34297671033478894, "grad_norm": 0.17840255796909332, "learning_rate": 3.824867518971973e-05, "loss": 0.15953952074050903, "step": 1885 }, { "epoch": 0.34388646288209607, "grad_norm": 0.16998249292373657, "learning_rate": 3.818619594131489e-05, "loss": 0.16027032136917113, "step": 1890 }, { "epoch": 0.3447962154294032, "grad_norm": 0.14950257539749146, "learning_rate": 3.812360238135897e-05, "loss": 0.15335670709609986, "step": 1895 }, { "epoch": 0.3457059679767103, "grad_norm": 0.1678011417388916, "learning_rate": 3.806089505247752e-05, "loss": 0.1560648798942566, "step": 1900 }, { "epoch": 0.34661572052401746, "grad_norm": 0.17944541573524475, "learning_rate": 3.799807449828238e-05, "loss": 0.16072254180908202, "step": 1905 }, { "epoch": 0.3475254730713246, "grad_norm": 0.166817307472229, "learning_rate": 3.793514126336691e-05, "loss": 0.1542820692062378, "step": 1910 }, { "epoch": 0.3484352256186317, "grad_norm": 0.16047626733779907, "learning_rate": 3.787209589330134e-05, "loss": 0.16092092990875245, "step": 1915 }, { "epoch": 0.34934497816593885, "grad_norm": 0.16478900611400604, "learning_rate": 3.7808938934627965e-05, "loss": 0.16765867471694945, "step": 1920 }, { "epoch": 0.350254730713246, "grad_norm": 0.15349514782428741, "learning_rate": 3.774567093485648e-05, "loss": 0.15890377759933472, "step": 1925 }, { "epoch": 0.3511644832605531, "grad_norm": 0.1515921950340271, "learning_rate": 3.768229244245917e-05, "loss": 0.16668319702148438, "step": 1930 }, { "epoch": 0.35207423580786024, "grad_norm": 0.16310466825962067, "learning_rate": 3.7618804006866195e-05, "loss": 0.15182652473449706, "step": 1935 }, { "epoch": 0.3529839883551674, "grad_norm": 0.17294517159461975, "learning_rate": 3.755520617846084e-05, "loss": 0.16287628412246705, "step": 1940 }, { "epoch": 0.35389374090247455, "grad_norm": 0.1482895463705063, "learning_rate": 3.749149950857467e-05, "loss": 0.15321952104568481, "step": 1945 }, { "epoch": 0.3548034934497817, "grad_norm": 0.2236029952764511, "learning_rate": 3.7427684549482847e-05, "loss": 0.15403482913970948, "step": 1950 }, { "epoch": 0.3557132459970888, "grad_norm": 0.20185327529907227, "learning_rate": 3.736376185439927e-05, "loss": 0.1633884072303772, "step": 1955 }, { "epoch": 0.35662299854439594, "grad_norm": 0.13906247913837433, "learning_rate": 3.7299731977471816e-05, "loss": 0.15925350189208984, "step": 1960 }, { "epoch": 0.35753275109170307, "grad_norm": 0.18665002286434174, "learning_rate": 3.723559547377751e-05, "loss": 0.1612026572227478, "step": 1965 }, { "epoch": 0.3584425036390102, "grad_norm": 0.16913433372974396, "learning_rate": 3.717135289931774e-05, "loss": 0.15479494333267213, "step": 1970 }, { "epoch": 0.35935225618631733, "grad_norm": 0.1620066910982132, "learning_rate": 3.7107004811013434e-05, "loss": 0.1604058027267456, "step": 1975 }, { "epoch": 0.36026200873362446, "grad_norm": 0.16838301718235016, "learning_rate": 3.704255176670021e-05, "loss": 0.15335073471069335, "step": 1980 }, { "epoch": 0.3611717612809316, "grad_norm": 0.3054695427417755, "learning_rate": 3.6977994325123535e-05, "loss": 0.16558053493499755, "step": 1985 }, { "epoch": 0.3620815138282387, "grad_norm": 0.1526716649532318, "learning_rate": 3.6913333045933934e-05, "loss": 0.16148923635482787, "step": 1990 }, { "epoch": 0.36299126637554585, "grad_norm": 0.15328513085842133, "learning_rate": 3.684856848968209e-05, "loss": 0.1553613781929016, "step": 1995 }, { "epoch": 0.363901018922853, "grad_norm": 0.16129714250564575, "learning_rate": 3.6783701217813995e-05, "loss": 0.16724612712860107, "step": 2000 }, { "epoch": 0.3648107714701601, "grad_norm": 0.15715539455413818, "learning_rate": 3.6718731792666086e-05, "loss": 0.15867922306060792, "step": 2005 }, { "epoch": 0.36572052401746724, "grad_norm": 0.15569166839122772, "learning_rate": 3.6653660777460366e-05, "loss": 0.1552058696746826, "step": 2010 }, { "epoch": 0.36663027656477437, "grad_norm": 0.16223010420799255, "learning_rate": 3.6588488736299535e-05, "loss": 0.1583200454711914, "step": 2015 }, { "epoch": 0.3675400291120815, "grad_norm": 0.18441995978355408, "learning_rate": 3.652321623416209e-05, "loss": 0.15050662755966188, "step": 2020 }, { "epoch": 0.36844978165938863, "grad_norm": 0.13792674243450165, "learning_rate": 3.645784383689742e-05, "loss": 0.15458759069442748, "step": 2025 }, { "epoch": 0.36935953420669576, "grad_norm": 0.14993111789226532, "learning_rate": 3.639237211122091e-05, "loss": 0.15926222801208495, "step": 2030 }, { "epoch": 0.3702692867540029, "grad_norm": 0.16815930604934692, "learning_rate": 3.632680162470904e-05, "loss": 0.15524441003799438, "step": 2035 }, { "epoch": 0.37117903930131, "grad_norm": 0.13312821090221405, "learning_rate": 3.626113294579441e-05, "loss": 0.15883516073226928, "step": 2040 }, { "epoch": 0.37208879184861715, "grad_norm": 0.16838273406028748, "learning_rate": 3.619536664376091e-05, "loss": 0.15829603672027587, "step": 2045 }, { "epoch": 0.37299854439592434, "grad_norm": 0.14706873893737793, "learning_rate": 3.612950328873869e-05, "loss": 0.15644397735595703, "step": 2050 }, { "epoch": 0.37390829694323147, "grad_norm": 0.1644199639558792, "learning_rate": 3.606354345169926e-05, "loss": 0.15858219861984252, "step": 2055 }, { "epoch": 0.3748180494905386, "grad_norm": 0.18077051639556885, "learning_rate": 3.599748770445055e-05, "loss": 0.1641286849975586, "step": 2060 }, { "epoch": 0.3757278020378457, "grad_norm": 0.16329127550125122, "learning_rate": 3.5931336619631914e-05, "loss": 0.15027186870574952, "step": 2065 }, { "epoch": 0.37663755458515286, "grad_norm": 0.16346783936023712, "learning_rate": 3.586509077070922e-05, "loss": 0.1558641314506531, "step": 2070 }, { "epoch": 0.37754730713246, "grad_norm": 0.1727602630853653, "learning_rate": 3.5798750731969834e-05, "loss": 0.15390506982803345, "step": 2075 }, { "epoch": 0.3784570596797671, "grad_norm": 0.7598192691802979, "learning_rate": 3.5732317078517654e-05, "loss": 0.1533232808113098, "step": 2080 }, { "epoch": 0.37936681222707425, "grad_norm": 0.1433355212211609, "learning_rate": 3.5665790386268124e-05, "loss": 0.15560413599014283, "step": 2085 }, { "epoch": 0.3802765647743814, "grad_norm": 0.18439625203609467, "learning_rate": 3.559917123194325e-05, "loss": 0.16695556640625, "step": 2090 }, { "epoch": 0.3811863173216885, "grad_norm": 0.1693502813577652, "learning_rate": 3.55324601930666e-05, "loss": 0.15957870483398437, "step": 2095 }, { "epoch": 0.38209606986899564, "grad_norm": 0.17776088416576385, "learning_rate": 3.54656578479583e-05, "loss": 0.1527492880821228, "step": 2100 }, { "epoch": 0.38300582241630277, "grad_norm": 0.15993724763393402, "learning_rate": 3.539876477572998e-05, "loss": 0.1567505717277527, "step": 2105 }, { "epoch": 0.3839155749636099, "grad_norm": 0.17067375779151917, "learning_rate": 3.533178155627981e-05, "loss": 0.14660797119140626, "step": 2110 }, { "epoch": 0.384825327510917, "grad_norm": 0.20239882171154022, "learning_rate": 3.526470877028745e-05, "loss": 0.1596767544746399, "step": 2115 }, { "epoch": 0.38573508005822416, "grad_norm": 0.1863643079996109, "learning_rate": 3.5197546999209005e-05, "loss": 0.15738571882247926, "step": 2120 }, { "epoch": 0.3866448326055313, "grad_norm": 0.16994133591651917, "learning_rate": 3.5130296825272014e-05, "loss": 0.16255316734313965, "step": 2125 }, { "epoch": 0.3875545851528384, "grad_norm": 0.18703415989875793, "learning_rate": 3.5062958831470355e-05, "loss": 0.15206334590911866, "step": 2130 }, { "epoch": 0.38846433770014555, "grad_norm": 0.15433982014656067, "learning_rate": 3.4995533601559226e-05, "loss": 0.1590178370475769, "step": 2135 }, { "epoch": 0.3893740902474527, "grad_norm": 0.16498146951198578, "learning_rate": 3.4928021720050104e-05, "loss": 0.14759145975112914, "step": 2140 }, { "epoch": 0.3902838427947598, "grad_norm": 0.17880478501319885, "learning_rate": 3.486042377220562e-05, "loss": 0.1642458915710449, "step": 2145 }, { "epoch": 0.39119359534206694, "grad_norm": 0.14700061082839966, "learning_rate": 3.479274034403455e-05, "loss": 0.16105138063430785, "step": 2150 }, { "epoch": 0.39210334788937407, "grad_norm": 0.1620762050151825, "learning_rate": 3.472497202228664e-05, "loss": 0.15104985237121582, "step": 2155 }, { "epoch": 0.3930131004366812, "grad_norm": 0.1625058799982071, "learning_rate": 3.4657119394447654e-05, "loss": 0.16145485639572144, "step": 2160 }, { "epoch": 0.3939228529839884, "grad_norm": 0.1631549596786499, "learning_rate": 3.458918304873417e-05, "loss": 0.16712255477905275, "step": 2165 }, { "epoch": 0.3948326055312955, "grad_norm": 0.16041551530361176, "learning_rate": 3.452116357408853e-05, "loss": 0.15118330717086792, "step": 2170 }, { "epoch": 0.39574235807860264, "grad_norm": 0.16692611575126648, "learning_rate": 3.44530615601737e-05, "loss": 0.16982550621032716, "step": 2175 }, { "epoch": 0.39665211062590977, "grad_norm": 0.16082268953323364, "learning_rate": 3.438487759736821e-05, "loss": 0.1513260006904602, "step": 2180 }, { "epoch": 0.3975618631732169, "grad_norm": 0.1474589854478836, "learning_rate": 3.4316612276761004e-05, "loss": 0.14968743324279785, "step": 2185 }, { "epoch": 0.39847161572052403, "grad_norm": 0.14531342685222626, "learning_rate": 3.42482661901463e-05, "loss": 0.1563260555267334, "step": 2190 }, { "epoch": 0.39938136826783116, "grad_norm": 0.16775506734848022, "learning_rate": 3.41798399300185e-05, "loss": 0.14861010313034057, "step": 2195 }, { "epoch": 0.4002911208151383, "grad_norm": 0.15065217018127441, "learning_rate": 3.411133408956703e-05, "loss": 0.15559519529342652, "step": 2200 }, { "epoch": 0.4012008733624454, "grad_norm": 0.16655296087265015, "learning_rate": 3.4042749262671184e-05, "loss": 0.16025567054748535, "step": 2205 }, { "epoch": 0.40211062590975255, "grad_norm": 0.14773905277252197, "learning_rate": 3.397408604389501e-05, "loss": 0.15074082612991332, "step": 2210 }, { "epoch": 0.4030203784570597, "grad_norm": 0.16233304142951965, "learning_rate": 3.3905345028482125e-05, "loss": 0.15490520000457764, "step": 2215 }, { "epoch": 0.4039301310043668, "grad_norm": 0.17520153522491455, "learning_rate": 3.383652681235058e-05, "loss": 0.1517520785331726, "step": 2220 }, { "epoch": 0.40483988355167394, "grad_norm": 0.14749875664710999, "learning_rate": 3.376763199208766e-05, "loss": 0.15410997867584228, "step": 2225 }, { "epoch": 0.40574963609898107, "grad_norm": 0.16855919361114502, "learning_rate": 3.369866116494477e-05, "loss": 0.1510261058807373, "step": 2230 }, { "epoch": 0.4066593886462882, "grad_norm": 0.1594122350215912, "learning_rate": 3.362961492883218e-05, "loss": 0.1493813395500183, "step": 2235 }, { "epoch": 0.40756914119359533, "grad_norm": 0.13645926117897034, "learning_rate": 3.3560493882313915e-05, "loss": 0.14876762628555298, "step": 2240 }, { "epoch": 0.40847889374090246, "grad_norm": 0.14304400980472565, "learning_rate": 3.349129862460251e-05, "loss": 0.15567013025283813, "step": 2245 }, { "epoch": 0.4093886462882096, "grad_norm": 0.17040041089057922, "learning_rate": 3.342202975555386e-05, "loss": 0.1563249945640564, "step": 2250 }, { "epoch": 0.4102983988355167, "grad_norm": 0.15594671666622162, "learning_rate": 3.3352687875661984e-05, "loss": 0.1546410083770752, "step": 2255 }, { "epoch": 0.41120815138282385, "grad_norm": 0.1677195280790329, "learning_rate": 3.328327358605384e-05, "loss": 0.15710171461105346, "step": 2260 }, { "epoch": 0.412117903930131, "grad_norm": 0.1731705516576767, "learning_rate": 3.321378748848412e-05, "loss": 0.16444036960601807, "step": 2265 }, { "epoch": 0.4130276564774381, "grad_norm": 0.18779033422470093, "learning_rate": 3.3144230185329984e-05, "loss": 0.15659687519073487, "step": 2270 }, { "epoch": 0.4139374090247453, "grad_norm": 0.1543768346309662, "learning_rate": 3.3074602279585913e-05, "loss": 0.15100739002227784, "step": 2275 }, { "epoch": 0.4148471615720524, "grad_norm": 0.16672168672084808, "learning_rate": 3.300490437485843e-05, "loss": 0.15535364151000977, "step": 2280 }, { "epoch": 0.41575691411935956, "grad_norm": 0.16741308569908142, "learning_rate": 3.293513707536089e-05, "loss": 0.15523911714553834, "step": 2285 }, { "epoch": 0.4166666666666667, "grad_norm": 0.1488303542137146, "learning_rate": 3.286530098590822e-05, "loss": 0.1542000651359558, "step": 2290 }, { "epoch": 0.4175764192139738, "grad_norm": 0.1637732982635498, "learning_rate": 3.2795396711911694e-05, "loss": 0.15354831218719484, "step": 2295 }, { "epoch": 0.41848617176128095, "grad_norm": 0.1472022533416748, "learning_rate": 3.272542485937369e-05, "loss": 0.16235145330429077, "step": 2300 }, { "epoch": 0.4193959243085881, "grad_norm": 0.15908290445804596, "learning_rate": 3.265538603488241e-05, "loss": 0.15642645359039306, "step": 2305 }, { "epoch": 0.4203056768558952, "grad_norm": 0.1584865301847458, "learning_rate": 3.2585280845606645e-05, "loss": 0.15490249395370484, "step": 2310 }, { "epoch": 0.42121542940320233, "grad_norm": 0.15893949568271637, "learning_rate": 3.251510989929052e-05, "loss": 0.1598116159439087, "step": 2315 }, { "epoch": 0.42212518195050946, "grad_norm": 0.18930596113204956, "learning_rate": 3.244487380424817e-05, "loss": 0.1482008934020996, "step": 2320 }, { "epoch": 0.4230349344978166, "grad_norm": 0.132876455783844, "learning_rate": 3.237457316935856e-05, "loss": 0.15304710865020751, "step": 2325 }, { "epoch": 0.4239446870451237, "grad_norm": 0.16447032988071442, "learning_rate": 3.2304208604060106e-05, "loss": 0.15298750400543212, "step": 2330 }, { "epoch": 0.42485443959243085, "grad_norm": 0.17748120427131653, "learning_rate": 3.223378071834546e-05, "loss": 0.1556084156036377, "step": 2335 }, { "epoch": 0.425764192139738, "grad_norm": 0.16366586089134216, "learning_rate": 3.2163290122756206e-05, "loss": 0.14387927055358887, "step": 2340 }, { "epoch": 0.4266739446870451, "grad_norm": 0.15398970246315002, "learning_rate": 3.209273742837755e-05, "loss": 0.16091293096542358, "step": 2345 }, { "epoch": 0.42758369723435224, "grad_norm": 0.164212167263031, "learning_rate": 3.202212324683305e-05, "loss": 0.15523531436920165, "step": 2350 }, { "epoch": 0.4284934497816594, "grad_norm": 0.16749800741672516, "learning_rate": 3.1951448190279255e-05, "loss": 0.15354975461959838, "step": 2355 }, { "epoch": 0.4294032023289665, "grad_norm": 0.14137034118175507, "learning_rate": 3.18807128714005e-05, "loss": 0.14981694221496583, "step": 2360 }, { "epoch": 0.43031295487627363, "grad_norm": 0.14848439395427704, "learning_rate": 3.1809917903403507e-05, "loss": 0.15448769330978393, "step": 2365 }, { "epoch": 0.43122270742358076, "grad_norm": 0.1747605800628662, "learning_rate": 3.1739063900012095e-05, "loss": 0.15882387161254882, "step": 2370 }, { "epoch": 0.4321324599708879, "grad_norm": 0.16054467856884003, "learning_rate": 3.166815147546186e-05, "loss": 0.15170297622680665, "step": 2375 }, { "epoch": 0.433042212518195, "grad_norm": 0.15428027510643005, "learning_rate": 3.1597181244494886e-05, "loss": 0.16202548742294312, "step": 2380 }, { "epoch": 0.4339519650655022, "grad_norm": 0.16747219860553741, "learning_rate": 3.1526153822354325e-05, "loss": 0.15461477041244506, "step": 2385 }, { "epoch": 0.43486171761280934, "grad_norm": 0.17415772378444672, "learning_rate": 3.145506982477918e-05, "loss": 0.16173542737960817, "step": 2390 }, { "epoch": 0.43577147016011647, "grad_norm": 0.1293518990278244, "learning_rate": 3.1383929867998865e-05, "loss": 0.15572521686553956, "step": 2395 }, { "epoch": 0.4366812227074236, "grad_norm": 0.16909323632717133, "learning_rate": 3.1312734568727935e-05, "loss": 0.15898628234863282, "step": 2400 }, { "epoch": 0.43759097525473073, "grad_norm": 0.16770294308662415, "learning_rate": 3.124148454416069e-05, "loss": 0.1536281704902649, "step": 2405 }, { "epoch": 0.43850072780203786, "grad_norm": 0.14078612625598907, "learning_rate": 3.117018041196585e-05, "loss": 0.15274266004562378, "step": 2410 }, { "epoch": 0.439410480349345, "grad_norm": 0.15457536280155182, "learning_rate": 3.1098822790281226e-05, "loss": 0.15391263961791993, "step": 2415 }, { "epoch": 0.4403202328966521, "grad_norm": 0.1640717089176178, "learning_rate": 3.102741229770827e-05, "loss": 0.15515168905258178, "step": 2420 }, { "epoch": 0.44122998544395925, "grad_norm": 0.2601533830165863, "learning_rate": 3.095594955330683e-05, "loss": 0.1587247371673584, "step": 2425 }, { "epoch": 0.4421397379912664, "grad_norm": 0.1352529525756836, "learning_rate": 3.08844351765897e-05, "loss": 0.1483217477798462, "step": 2430 }, { "epoch": 0.4430494905385735, "grad_norm": 0.18479721248149872, "learning_rate": 3.081286978751728e-05, "loss": 0.15121787786483765, "step": 2435 }, { "epoch": 0.44395924308588064, "grad_norm": 0.16954511404037476, "learning_rate": 3.074125400649221e-05, "loss": 0.16073100566864013, "step": 2440 }, { "epoch": 0.44486899563318777, "grad_norm": 0.15154729783535004, "learning_rate": 3.0669588454353944e-05, "loss": 0.15738017559051515, "step": 2445 }, { "epoch": 0.4457787481804949, "grad_norm": 0.1540488302707672, "learning_rate": 3.059787375237344e-05, "loss": 0.1515384554862976, "step": 2450 }, { "epoch": 0.44668850072780203, "grad_norm": 0.1814432442188263, "learning_rate": 3.052611052224774e-05, "loss": 0.15731438398361205, "step": 2455 }, { "epoch": 0.44759825327510916, "grad_norm": 0.16657036542892456, "learning_rate": 3.0454299386094542e-05, "loss": 0.15741543769836425, "step": 2460 }, { "epoch": 0.4485080058224163, "grad_norm": 0.2177237570285797, "learning_rate": 3.0382440966446875e-05, "loss": 0.14972515106201173, "step": 2465 }, { "epoch": 0.4494177583697234, "grad_norm": 0.1669909954071045, "learning_rate": 3.031053588624766e-05, "loss": 0.1506432294845581, "step": 2470 }, { "epoch": 0.45032751091703055, "grad_norm": 0.1752234250307083, "learning_rate": 3.0238584768844313e-05, "loss": 0.14969609975814818, "step": 2475 }, { "epoch": 0.4512372634643377, "grad_norm": 0.18267901241779327, "learning_rate": 3.0166588237983363e-05, "loss": 0.15112748146057128, "step": 2480 }, { "epoch": 0.4521470160116448, "grad_norm": 0.16250105202198029, "learning_rate": 3.0094546917805007e-05, "loss": 0.15864100456237792, "step": 2485 }, { "epoch": 0.45305676855895194, "grad_norm": 0.14825721085071564, "learning_rate": 3.0022461432837752e-05, "loss": 0.1513954520225525, "step": 2490 }, { "epoch": 0.4539665211062591, "grad_norm": 0.1626640111207962, "learning_rate": 2.9950332407992943e-05, "loss": 0.1505578875541687, "step": 2495 }, { "epoch": 0.45487627365356625, "grad_norm": 0.1535351574420929, "learning_rate": 2.987816046855939e-05, "loss": 0.15255829095840454, "step": 2500 }, { "epoch": 0.4557860262008734, "grad_norm": 0.17552775144577026, "learning_rate": 2.9805946240197928e-05, "loss": 0.1516443133354187, "step": 2505 }, { "epoch": 0.4566957787481805, "grad_norm": 0.16020981967449188, "learning_rate": 2.9733690348935994e-05, "loss": 0.14519743919372557, "step": 2510 }, { "epoch": 0.45760553129548764, "grad_norm": 0.17800211906433105, "learning_rate": 2.9661393421162204e-05, "loss": 0.15679080486297609, "step": 2515 }, { "epoch": 0.4585152838427948, "grad_norm": 0.16016991436481476, "learning_rate": 2.9589056083620902e-05, "loss": 0.14768127202987671, "step": 2520 }, { "epoch": 0.4594250363901019, "grad_norm": 0.16272081434726715, "learning_rate": 2.951667896340679e-05, "loss": 0.1513301968574524, "step": 2525 }, { "epoch": 0.46033478893740903, "grad_norm": 0.1726413071155548, "learning_rate": 2.9444262687959402e-05, "loss": 0.14819332361221313, "step": 2530 }, { "epoch": 0.46124454148471616, "grad_norm": 0.1670403778553009, "learning_rate": 2.9371807885057735e-05, "loss": 0.15245940685272216, "step": 2535 }, { "epoch": 0.4621542940320233, "grad_norm": 0.1650049239397049, "learning_rate": 2.9299315182814772e-05, "loss": 0.15187418460845947, "step": 2540 }, { "epoch": 0.4630640465793304, "grad_norm": 0.16327734291553497, "learning_rate": 2.9226785209672047e-05, "loss": 0.15579828023910522, "step": 2545 }, { "epoch": 0.46397379912663755, "grad_norm": 0.3367880582809448, "learning_rate": 2.91542185943942e-05, "loss": 0.15617697238922118, "step": 2550 }, { "epoch": 0.4648835516739447, "grad_norm": 0.1731594055891037, "learning_rate": 2.908161596606353e-05, "loss": 0.1559603691101074, "step": 2555 }, { "epoch": 0.4657933042212518, "grad_norm": 0.1477293074131012, "learning_rate": 2.9008977954074517e-05, "loss": 0.15567959547042848, "step": 2560 }, { "epoch": 0.46670305676855894, "grad_norm": 0.16227173805236816, "learning_rate": 2.8936305188128392e-05, "loss": 0.1522113561630249, "step": 2565 }, { "epoch": 0.4676128093158661, "grad_norm": 0.2031075656414032, "learning_rate": 2.8863598298227674e-05, "loss": 0.15054640769958497, "step": 2570 }, { "epoch": 0.4685225618631732, "grad_norm": 0.18351472914218903, "learning_rate": 2.8790857914670698e-05, "loss": 0.15837019681930542, "step": 2575 }, { "epoch": 0.46943231441048033, "grad_norm": 0.15914765000343323, "learning_rate": 2.871808466804616e-05, "loss": 0.1550259470939636, "step": 2580 }, { "epoch": 0.47034206695778746, "grad_norm": 0.17366717755794525, "learning_rate": 2.8645279189227636e-05, "loss": 0.15702390670776367, "step": 2585 }, { "epoch": 0.4712518195050946, "grad_norm": 0.13677838444709778, "learning_rate": 2.8572442109368134e-05, "loss": 0.15485031604766847, "step": 2590 }, { "epoch": 0.4721615720524017, "grad_norm": 0.1477748304605484, "learning_rate": 2.8499574059894617e-05, "loss": 0.14577245712280273, "step": 2595 }, { "epoch": 0.47307132459970885, "grad_norm": 0.1582217663526535, "learning_rate": 2.842667567250252e-05, "loss": 0.15586793422698975, "step": 2600 }, { "epoch": 0.47398107714701604, "grad_norm": 0.19658738374710083, "learning_rate": 2.8353747579150268e-05, "loss": 0.15060495138168334, "step": 2605 }, { "epoch": 0.47489082969432317, "grad_norm": 0.176767036318779, "learning_rate": 2.828079041205382e-05, "loss": 0.15116705894470214, "step": 2610 }, { "epoch": 0.4758005822416303, "grad_norm": 0.16972507536411285, "learning_rate": 2.820780480368117e-05, "loss": 0.1541937470436096, "step": 2615 }, { "epoch": 0.47671033478893743, "grad_norm": 0.1548585742712021, "learning_rate": 2.8134791386746884e-05, "loss": 0.14334756135940552, "step": 2620 }, { "epoch": 0.47762008733624456, "grad_norm": 0.15411986410617828, "learning_rate": 2.806175079420658e-05, "loss": 0.14642289876937867, "step": 2625 }, { "epoch": 0.4785298398835517, "grad_norm": 0.16609491407871246, "learning_rate": 2.7988683659251474e-05, "loss": 0.15083469152450563, "step": 2630 }, { "epoch": 0.4794395924308588, "grad_norm": 0.16592684388160706, "learning_rate": 2.791559061530289e-05, "loss": 0.14218480587005616, "step": 2635 }, { "epoch": 0.48034934497816595, "grad_norm": 0.1764935404062271, "learning_rate": 2.7842472296006722e-05, "loss": 0.15004343986511232, "step": 2640 }, { "epoch": 0.4812590975254731, "grad_norm": 0.20094354450702667, "learning_rate": 2.7769329335228022e-05, "loss": 0.14975016117095946, "step": 2645 }, { "epoch": 0.4821688500727802, "grad_norm": 0.1869269460439682, "learning_rate": 2.769616236704542e-05, "loss": 0.155981707572937, "step": 2650 }, { "epoch": 0.48307860262008734, "grad_norm": 0.16671574115753174, "learning_rate": 2.762297202574571e-05, "loss": 0.14633859395980836, "step": 2655 }, { "epoch": 0.48398835516739447, "grad_norm": 0.14999663829803467, "learning_rate": 2.754975894581826e-05, "loss": 0.15692603588104248, "step": 2660 }, { "epoch": 0.4848981077147016, "grad_norm": 0.16893649101257324, "learning_rate": 2.7476523761949592e-05, "loss": 0.14530394077301026, "step": 2665 }, { "epoch": 0.48580786026200873, "grad_norm": 0.16039884090423584, "learning_rate": 2.740326710901784e-05, "loss": 0.15013915300369263, "step": 2670 }, { "epoch": 0.48671761280931586, "grad_norm": 0.16672006249427795, "learning_rate": 2.732998962208725e-05, "loss": 0.15667349100112915, "step": 2675 }, { "epoch": 0.487627365356623, "grad_norm": 0.2160867303609848, "learning_rate": 2.7256691936402684e-05, "loss": 0.14335414171218872, "step": 2680 }, { "epoch": 0.4885371179039301, "grad_norm": 0.349030077457428, "learning_rate": 2.71833746873841e-05, "loss": 0.1437530279159546, "step": 2685 }, { "epoch": 0.48944687045123725, "grad_norm": 0.18380966782569885, "learning_rate": 2.7110038510621073e-05, "loss": 0.1476014256477356, "step": 2690 }, { "epoch": 0.4903566229985444, "grad_norm": 0.1523742377758026, "learning_rate": 2.703668404186722e-05, "loss": 0.14578526020050048, "step": 2695 }, { "epoch": 0.4912663755458515, "grad_norm": 0.16092729568481445, "learning_rate": 2.696331191703479e-05, "loss": 0.15335593223571778, "step": 2700 }, { "epoch": 0.49217612809315864, "grad_norm": 0.17185333371162415, "learning_rate": 2.688992277218904e-05, "loss": 0.1540898084640503, "step": 2705 }, { "epoch": 0.49308588064046577, "grad_norm": 0.1521969735622406, "learning_rate": 2.6816517243542792e-05, "loss": 0.15171396732330322, "step": 2710 }, { "epoch": 0.49399563318777295, "grad_norm": 0.16064171493053436, "learning_rate": 2.674309596745092e-05, "loss": 0.1505839228630066, "step": 2715 }, { "epoch": 0.4949053857350801, "grad_norm": 0.16430898010730743, "learning_rate": 2.6669659580404795e-05, "loss": 0.1551363468170166, "step": 2720 }, { "epoch": 0.4958151382823872, "grad_norm": 0.16125477850437164, "learning_rate": 2.659620871902677e-05, "loss": 0.15069286823272704, "step": 2725 }, { "epoch": 0.49672489082969434, "grad_norm": 0.1428450047969818, "learning_rate": 2.652274402006471e-05, "loss": 0.15511081218719483, "step": 2730 }, { "epoch": 0.4976346433770015, "grad_norm": 0.15452754497528076, "learning_rate": 2.6449266120386406e-05, "loss": 0.14941939115524291, "step": 2735 }, { "epoch": 0.4985443959243086, "grad_norm": 0.17243537306785583, "learning_rate": 2.6375775656974123e-05, "loss": 0.151741623878479, "step": 2740 }, { "epoch": 0.49945414847161573, "grad_norm": 0.13736453652381897, "learning_rate": 2.6302273266919008e-05, "loss": 0.147042977809906, "step": 2745 }, { "epoch": 0.5003639010189228, "grad_norm": 0.16241495311260223, "learning_rate": 2.6228759587415614e-05, "loss": 0.14664684534072875, "step": 2750 }, { "epoch": 0.50127365356623, "grad_norm": 0.193496435880661, "learning_rate": 2.6155235255756356e-05, "loss": 0.15486966371536254, "step": 2755 }, { "epoch": 0.5021834061135371, "grad_norm": 0.1542847901582718, "learning_rate": 2.6081700909326e-05, "loss": 0.15148009061813356, "step": 2760 }, { "epoch": 0.5030931586608443, "grad_norm": 0.1696511209011078, "learning_rate": 2.6008157185596142e-05, "loss": 0.14190055131912233, "step": 2765 }, { "epoch": 0.5040029112081513, "grad_norm": 0.14690077304840088, "learning_rate": 2.5934604722119655e-05, "loss": 0.1570739269256592, "step": 2770 }, { "epoch": 0.5049126637554585, "grad_norm": 0.17149671912193298, "learning_rate": 2.5861044156525162e-05, "loss": 0.14940304756164552, "step": 2775 }, { "epoch": 0.5058224163027657, "grad_norm": 0.16639231145381927, "learning_rate": 2.578747612651155e-05, "loss": 0.15691237449645995, "step": 2780 }, { "epoch": 0.5067321688500728, "grad_norm": 0.2062763124704361, "learning_rate": 2.5713901269842404e-05, "loss": 0.1564734935760498, "step": 2785 }, { "epoch": 0.50764192139738, "grad_norm": 0.12636308372020721, "learning_rate": 2.5640320224340502e-05, "loss": 0.14539417028427123, "step": 2790 }, { "epoch": 0.508551673944687, "grad_norm": 0.16893689334392548, "learning_rate": 2.556673362788225e-05, "loss": 0.15440930128097535, "step": 2795 }, { "epoch": 0.5094614264919942, "grad_norm": 0.16250015795230865, "learning_rate": 2.54931421183922e-05, "loss": 0.14485647678375244, "step": 2800 }, { "epoch": 0.5103711790393013, "grad_norm": 0.1700994372367859, "learning_rate": 2.5419546333837462e-05, "loss": 0.15411126613616943, "step": 2805 }, { "epoch": 0.5112809315866085, "grad_norm": 0.1547706127166748, "learning_rate": 2.5345946912222256e-05, "loss": 0.15516072511672974, "step": 2810 }, { "epoch": 0.5121906841339156, "grad_norm": 0.17955681681632996, "learning_rate": 2.527234449158228e-05, "loss": 0.15546923875808716, "step": 2815 }, { "epoch": 0.5131004366812227, "grad_norm": 0.163709819316864, "learning_rate": 2.519873970997927e-05, "loss": 0.15665037631988527, "step": 2820 }, { "epoch": 0.5140101892285298, "grad_norm": 0.17859576642513275, "learning_rate": 2.5125133205495405e-05, "loss": 0.1539722204208374, "step": 2825 }, { "epoch": 0.514919941775837, "grad_norm": 0.17443150281906128, "learning_rate": 2.5051525616227806e-05, "loss": 0.148411762714386, "step": 2830 }, { "epoch": 0.5158296943231441, "grad_norm": 0.17397581040859222, "learning_rate": 2.4977917580283007e-05, "loss": 0.14880497455596925, "step": 2835 }, { "epoch": 0.5167394468704513, "grad_norm": 0.14565663039684296, "learning_rate": 2.4904309735771405e-05, "loss": 0.14934680461883545, "step": 2840 }, { "epoch": 0.5176491994177583, "grad_norm": 0.17895659804344177, "learning_rate": 2.4830702720801746e-05, "loss": 0.15287939310073853, "step": 2845 }, { "epoch": 0.5185589519650655, "grad_norm": 0.15812788903713226, "learning_rate": 2.4757097173475572e-05, "loss": 0.14576947689056396, "step": 2850 }, { "epoch": 0.5194687045123726, "grad_norm": 0.17123781144618988, "learning_rate": 2.46834937318817e-05, "loss": 0.15224847793579102, "step": 2855 }, { "epoch": 0.5203784570596798, "grad_norm": 0.14845474064350128, "learning_rate": 2.460989303409072e-05, "loss": 0.14901585578918458, "step": 2860 }, { "epoch": 0.5212882096069869, "grad_norm": 0.23493704199790955, "learning_rate": 2.4536295718149407e-05, "loss": 0.1517487049102783, "step": 2865 }, { "epoch": 0.522197962154294, "grad_norm": 0.16209843754768372, "learning_rate": 2.4462702422075217e-05, "loss": 0.14327445030212402, "step": 2870 }, { "epoch": 0.5231077147016011, "grad_norm": 0.17249803245067596, "learning_rate": 2.4389113783850793e-05, "loss": 0.1517549753189087, "step": 2875 }, { "epoch": 0.5240174672489083, "grad_norm": 0.14561402797698975, "learning_rate": 2.431553044141836e-05, "loss": 0.14764087200164794, "step": 2880 }, { "epoch": 0.5249272197962155, "grad_norm": 0.17033302783966064, "learning_rate": 2.4241953032674256e-05, "loss": 0.15181604623794556, "step": 2885 }, { "epoch": 0.5258369723435226, "grad_norm": 0.1184430941939354, "learning_rate": 2.4168382195463367e-05, "loss": 0.14264242649078368, "step": 2890 }, { "epoch": 0.5267467248908297, "grad_norm": 0.17521196603775024, "learning_rate": 2.4094818567573618e-05, "loss": 0.1509538173675537, "step": 2895 }, { "epoch": 0.5276564774381368, "grad_norm": 0.1681576371192932, "learning_rate": 2.4021262786730428e-05, "loss": 0.15344605445861817, "step": 2900 }, { "epoch": 0.528566229985444, "grad_norm": 0.17134182155132294, "learning_rate": 2.3947715490591206e-05, "loss": 0.15161689519882202, "step": 2905 }, { "epoch": 0.5294759825327511, "grad_norm": 0.1796472817659378, "learning_rate": 2.3874177316739778e-05, "loss": 0.15086464881896972, "step": 2910 }, { "epoch": 0.5303857350800583, "grad_norm": 0.23268625140190125, "learning_rate": 2.380064890268093e-05, "loss": 0.15354180335998535, "step": 2915 }, { "epoch": 0.5312954876273653, "grad_norm": 0.16318941116333008, "learning_rate": 2.372713088583481e-05, "loss": 0.15131797790527343, "step": 2920 }, { "epoch": 0.5322052401746725, "grad_norm": 0.18171803653240204, "learning_rate": 2.365362390353143e-05, "loss": 0.15784090757369995, "step": 2925 }, { "epoch": 0.5331149927219796, "grad_norm": 0.17672640085220337, "learning_rate": 2.3580128593005156e-05, "loss": 0.15509436130523682, "step": 2930 }, { "epoch": 0.5340247452692868, "grad_norm": 0.15985223650932312, "learning_rate": 2.3506645591389174e-05, "loss": 0.14851027727127075, "step": 2935 }, { "epoch": 0.5349344978165939, "grad_norm": 0.16597607731819153, "learning_rate": 2.343317553570995e-05, "loss": 0.1504931092262268, "step": 2940 }, { "epoch": 0.535844250363901, "grad_norm": 0.20180748403072357, "learning_rate": 2.3359719062881725e-05, "loss": 0.15023820400238036, "step": 2945 }, { "epoch": 0.5367540029112081, "grad_norm": 0.1735963076353073, "learning_rate": 2.3286276809701e-05, "loss": 0.15374408960342406, "step": 2950 }, { "epoch": 0.5376637554585153, "grad_norm": 0.17629501223564148, "learning_rate": 2.3212849412840995e-05, "loss": 0.15007833242416382, "step": 2955 }, { "epoch": 0.5385735080058224, "grad_norm": 0.1493796557188034, "learning_rate": 2.3139437508846155e-05, "loss": 0.15206656455993653, "step": 2960 }, { "epoch": 0.5394832605531296, "grad_norm": 0.17426837980747223, "learning_rate": 2.306604173412659e-05, "loss": 0.1441131591796875, "step": 2965 }, { "epoch": 0.5403930131004366, "grad_norm": 0.16984431445598602, "learning_rate": 2.2992662724952613e-05, "loss": 0.14438753128051757, "step": 2970 }, { "epoch": 0.5413027656477438, "grad_norm": 0.1814386397600174, "learning_rate": 2.2919301117449167e-05, "loss": 0.14887022972106934, "step": 2975 }, { "epoch": 0.5422125181950509, "grad_norm": 0.158392995595932, "learning_rate": 2.2845957547590368e-05, "loss": 0.14404361248016356, "step": 2980 }, { "epoch": 0.5431222707423581, "grad_norm": 0.17496263980865479, "learning_rate": 2.2772632651193953e-05, "loss": 0.1454906702041626, "step": 2985 }, { "epoch": 0.5440320232896652, "grad_norm": 0.157533198595047, "learning_rate": 2.2699327063915766e-05, "loss": 0.1458217740058899, "step": 2990 }, { "epoch": 0.5449417758369723, "grad_norm": 0.1767890453338623, "learning_rate": 2.262604142124427e-05, "loss": 0.14384825229644777, "step": 2995 }, { "epoch": 0.5458515283842795, "grad_norm": 0.1851050704717636, "learning_rate": 2.2552776358495033e-05, "loss": 0.14832457304000854, "step": 3000 }, { "epoch": 0.5467612809315866, "grad_norm": 0.164175882935524, "learning_rate": 2.247953251080521e-05, "loss": 0.14999878406524658, "step": 3005 }, { "epoch": 0.5476710334788938, "grad_norm": 0.3403675854206085, "learning_rate": 2.240631051312804e-05, "loss": 0.1443937063217163, "step": 3010 }, { "epoch": 0.5485807860262009, "grad_norm": 0.16751109063625336, "learning_rate": 2.2333111000227342e-05, "loss": 0.1462402105331421, "step": 3015 }, { "epoch": 0.549490538573508, "grad_norm": 0.14741151034832, "learning_rate": 2.225993460667201e-05, "loss": 0.149855899810791, "step": 3020 }, { "epoch": 0.5504002911208151, "grad_norm": 0.20605266094207764, "learning_rate": 2.218678196683054e-05, "loss": 0.15413178205490113, "step": 3025 }, { "epoch": 0.5513100436681223, "grad_norm": 0.14884796738624573, "learning_rate": 2.2113653714865473e-05, "loss": 0.14592334032058715, "step": 3030 }, { "epoch": 0.5522197962154294, "grad_norm": 0.17114350199699402, "learning_rate": 2.2040550484727943e-05, "loss": 0.1498338460922241, "step": 3035 }, { "epoch": 0.5531295487627366, "grad_norm": 0.16496853530406952, "learning_rate": 2.196747291015219e-05, "loss": 0.14650315046310425, "step": 3040 }, { "epoch": 0.5540393013100436, "grad_norm": 0.15172401070594788, "learning_rate": 2.189442162465001e-05, "loss": 0.14984124898910522, "step": 3045 }, { "epoch": 0.5549490538573508, "grad_norm": 0.19258467853069305, "learning_rate": 2.182139726150532e-05, "loss": 0.1486764669418335, "step": 3050 }, { "epoch": 0.5558588064046579, "grad_norm": 0.1749001443386078, "learning_rate": 2.1748400453768652e-05, "loss": 0.14983701705932617, "step": 3055 }, { "epoch": 0.5567685589519651, "grad_norm": 0.37510567903518677, "learning_rate": 2.1675431834251637e-05, "loss": 0.14483561515808105, "step": 3060 }, { "epoch": 0.5576783114992722, "grad_norm": 0.16932405531406403, "learning_rate": 2.1602492035521553e-05, "loss": 0.14487643241882325, "step": 3065 }, { "epoch": 0.5585880640465793, "grad_norm": 0.174176424741745, "learning_rate": 2.152958168989584e-05, "loss": 0.14737497568130492, "step": 3070 }, { "epoch": 0.5594978165938864, "grad_norm": 0.1601252257823944, "learning_rate": 2.1456701429436577e-05, "loss": 0.15183379650115966, "step": 3075 }, { "epoch": 0.5604075691411936, "grad_norm": 0.14960910379886627, "learning_rate": 2.1383851885945085e-05, "loss": 0.143074893951416, "step": 3080 }, { "epoch": 0.5613173216885007, "grad_norm": 0.1678633838891983, "learning_rate": 2.1311033690956346e-05, "loss": 0.14961432218551635, "step": 3085 }, { "epoch": 0.5622270742358079, "grad_norm": 0.15814319252967834, "learning_rate": 2.1238247475733613e-05, "loss": 0.14308581352233887, "step": 3090 }, { "epoch": 0.5631368267831149, "grad_norm": 0.21240772306919098, "learning_rate": 2.1165493871262887e-05, "loss": 0.14737485647201537, "step": 3095 }, { "epoch": 0.5640465793304221, "grad_norm": 0.15161271393299103, "learning_rate": 2.109277350824749e-05, "loss": 0.14534420967102052, "step": 3100 }, { "epoch": 0.5649563318777293, "grad_norm": 0.16572362184524536, "learning_rate": 2.1020087017102537e-05, "loss": 0.14299670457839966, "step": 3105 }, { "epoch": 0.5658660844250364, "grad_norm": 0.1548164039850235, "learning_rate": 2.094743502794954e-05, "loss": 0.14371142387390137, "step": 3110 }, { "epoch": 0.5667758369723436, "grad_norm": 0.2574169933795929, "learning_rate": 2.0874818170610885e-05, "loss": 0.14350423812866211, "step": 3115 }, { "epoch": 0.5676855895196506, "grad_norm": 0.16359548270702362, "learning_rate": 2.080223707460443e-05, "loss": 0.1520243763923645, "step": 3120 }, { "epoch": 0.5685953420669578, "grad_norm": 0.1798320859670639, "learning_rate": 2.072969236913799e-05, "loss": 0.14832595586776734, "step": 3125 }, { "epoch": 0.5695050946142649, "grad_norm": 0.17045916616916656, "learning_rate": 2.0657184683103926e-05, "loss": 0.15308042764663696, "step": 3130 }, { "epoch": 0.5704148471615721, "grad_norm": 0.16345897316932678, "learning_rate": 2.058471464507366e-05, "loss": 0.14564799070358275, "step": 3135 }, { "epoch": 0.5713245997088792, "grad_norm": 0.15170110762119293, "learning_rate": 2.0512282883292257e-05, "loss": 0.14161767959594726, "step": 3140 }, { "epoch": 0.5722343522561864, "grad_norm": 0.8107472658157349, "learning_rate": 2.0439890025672955e-05, "loss": 0.14481087923049926, "step": 3145 }, { "epoch": 0.5731441048034934, "grad_norm": 0.15346679091453552, "learning_rate": 2.036753669979174e-05, "loss": 0.14860262870788574, "step": 3150 }, { "epoch": 0.5740538573508006, "grad_norm": 0.1632593423128128, "learning_rate": 2.0295223532881886e-05, "loss": 0.1481687307357788, "step": 3155 }, { "epoch": 0.5749636098981077, "grad_norm": 0.23399172723293304, "learning_rate": 2.022295115182852e-05, "loss": 0.149153733253479, "step": 3160 }, { "epoch": 0.5758733624454149, "grad_norm": 0.14977394044399261, "learning_rate": 2.015072018316323e-05, "loss": 0.14921388626098633, "step": 3165 }, { "epoch": 0.576783114992722, "grad_norm": 0.1550658792257309, "learning_rate": 2.007853125305856e-05, "loss": 0.1482759475708008, "step": 3170 }, { "epoch": 0.5776928675400291, "grad_norm": 0.16661737859249115, "learning_rate": 2.0006384987322645e-05, "loss": 0.14903552532196046, "step": 3175 }, { "epoch": 0.5786026200873362, "grad_norm": 0.1746823936700821, "learning_rate": 1.9934282011393753e-05, "loss": 0.1412947654724121, "step": 3180 }, { "epoch": 0.5795123726346434, "grad_norm": 0.17025792598724365, "learning_rate": 1.9862222950334857e-05, "loss": 0.15289769172668458, "step": 3185 }, { "epoch": 0.5804221251819505, "grad_norm": 0.16857658326625824, "learning_rate": 1.9790208428828252e-05, "loss": 0.14419941902160643, "step": 3190 }, { "epoch": 0.5813318777292577, "grad_norm": 0.16099876165390015, "learning_rate": 1.9718239071170118e-05, "loss": 0.14476487636566163, "step": 3195 }, { "epoch": 0.5822416302765647, "grad_norm": 0.16140873730182648, "learning_rate": 1.964631550126508e-05, "loss": 0.14588416814804078, "step": 3200 }, { "epoch": 0.5831513828238719, "grad_norm": 0.15719448029994965, "learning_rate": 1.957443834262087e-05, "loss": 0.15144693851470947, "step": 3205 }, { "epoch": 0.584061135371179, "grad_norm": 0.16512645781040192, "learning_rate": 1.950260821834285e-05, "loss": 0.14787566661834717, "step": 3210 }, { "epoch": 0.5849708879184862, "grad_norm": 0.18584516644477844, "learning_rate": 1.9430825751128643e-05, "loss": 0.14514710903167724, "step": 3215 }, { "epoch": 0.5858806404657934, "grad_norm": 0.17640981078147888, "learning_rate": 1.9359091563262742e-05, "loss": 0.1511004686355591, "step": 3220 }, { "epoch": 0.5867903930131004, "grad_norm": 0.1697624921798706, "learning_rate": 1.9287406276611095e-05, "loss": 0.15392563343048096, "step": 3225 }, { "epoch": 0.5877001455604076, "grad_norm": 0.1677260845899582, "learning_rate": 1.9215770512615725e-05, "loss": 0.15311745405197144, "step": 3230 }, { "epoch": 0.5886098981077147, "grad_norm": 0.15357480943202972, "learning_rate": 1.9144184892289337e-05, "loss": 0.14370160102844237, "step": 3235 }, { "epoch": 0.5895196506550219, "grad_norm": 0.18601207435131073, "learning_rate": 1.9072650036209955e-05, "loss": 0.14095077514648438, "step": 3240 }, { "epoch": 0.590429403202329, "grad_norm": 0.17313526570796967, "learning_rate": 1.9001166564515513e-05, "loss": 0.148259174823761, "step": 3245 }, { "epoch": 0.5913391557496361, "grad_norm": 0.1634378433227539, "learning_rate": 1.8929735096898504e-05, "loss": 0.15082294940948487, "step": 3250 }, { "epoch": 0.5922489082969432, "grad_norm": 0.18542174994945526, "learning_rate": 1.885835625260058e-05, "loss": 0.14461435079574586, "step": 3255 }, { "epoch": 0.5931586608442504, "grad_norm": 0.1740756630897522, "learning_rate": 1.87870306504072e-05, "loss": 0.14083608388900756, "step": 3260 }, { "epoch": 0.5940684133915575, "grad_norm": 0.25606217980384827, "learning_rate": 1.8715758908642288e-05, "loss": 0.15125386714935302, "step": 3265 }, { "epoch": 0.5949781659388647, "grad_norm": 0.20194627344608307, "learning_rate": 1.8644541645162834e-05, "loss": 0.14433003664016725, "step": 3270 }, { "epoch": 0.5958879184861717, "grad_norm": 0.1902168095111847, "learning_rate": 1.8573379477353542e-05, "loss": 0.14718132019042968, "step": 3275 }, { "epoch": 0.5967976710334789, "grad_norm": 0.15122972428798676, "learning_rate": 1.850227302212151e-05, "loss": 0.153376567363739, "step": 3280 }, { "epoch": 0.597707423580786, "grad_norm": 0.14331959187984467, "learning_rate": 1.843122289589085e-05, "loss": 0.146630597114563, "step": 3285 }, { "epoch": 0.5986171761280932, "grad_norm": 0.15083099901676178, "learning_rate": 1.836022971459737e-05, "loss": 0.1445971965789795, "step": 3290 }, { "epoch": 0.5995269286754003, "grad_norm": 0.16585418581962585, "learning_rate": 1.828929409368321e-05, "loss": 0.15120241641998292, "step": 3295 }, { "epoch": 0.6004366812227074, "grad_norm": 0.1653224229812622, "learning_rate": 1.8218416648091524e-05, "loss": 0.14349838495254516, "step": 3300 }, { "epoch": 0.6013464337700145, "grad_norm": 0.1891375184059143, "learning_rate": 1.8147597992261124e-05, "loss": 0.15171384811401367, "step": 3305 }, { "epoch": 0.6022561863173217, "grad_norm": 0.13392704725265503, "learning_rate": 1.8076838740121187e-05, "loss": 0.14607118368148803, "step": 3310 }, { "epoch": 0.6031659388646288, "grad_norm": 0.15421944856643677, "learning_rate": 1.8006139505085926e-05, "loss": 0.1380957007408142, "step": 3315 }, { "epoch": 0.604075691411936, "grad_norm": 0.16637761890888214, "learning_rate": 1.7935500900049246e-05, "loss": 0.14604611396789552, "step": 3320 }, { "epoch": 0.6049854439592431, "grad_norm": 0.16638441383838654, "learning_rate": 1.7864923537379445e-05, "loss": 0.1513611912727356, "step": 3325 }, { "epoch": 0.6058951965065502, "grad_norm": 0.1745707094669342, "learning_rate": 1.779440802891394e-05, "loss": 0.15391240119934083, "step": 3330 }, { "epoch": 0.6068049490538574, "grad_norm": 0.1620505005121231, "learning_rate": 1.77239549859539e-05, "loss": 0.14986472129821776, "step": 3335 }, { "epoch": 0.6077147016011645, "grad_norm": 0.1579132080078125, "learning_rate": 1.7653565019259e-05, "loss": 0.1466603994369507, "step": 3340 }, { "epoch": 0.6086244541484717, "grad_norm": 0.19154994189739227, "learning_rate": 1.7583238739042086e-05, "loss": 0.15228934288024903, "step": 3345 }, { "epoch": 0.6095342066957787, "grad_norm": 0.15771779417991638, "learning_rate": 1.7512976754963913e-05, "loss": 0.14965078830718995, "step": 3350 }, { "epoch": 0.6104439592430859, "grad_norm": 0.18406136333942413, "learning_rate": 1.744277967612785e-05, "loss": 0.1473196864128113, "step": 3355 }, { "epoch": 0.611353711790393, "grad_norm": 0.17603816092014313, "learning_rate": 1.7372648111074607e-05, "loss": 0.1430676221847534, "step": 3360 }, { "epoch": 0.6122634643377002, "grad_norm": 0.156408429145813, "learning_rate": 1.7302582667776933e-05, "loss": 0.14018454551696777, "step": 3365 }, { "epoch": 0.6131732168850073, "grad_norm": 0.14504843950271606, "learning_rate": 1.7232583953634407e-05, "loss": 0.14505640268325806, "step": 3370 }, { "epoch": 0.6140829694323144, "grad_norm": 0.1864968240261078, "learning_rate": 1.716265257546808e-05, "loss": 0.14810394048690795, "step": 3375 }, { "epoch": 0.6149927219796215, "grad_norm": 0.1621711403131485, "learning_rate": 1.7092789139515295e-05, "loss": 0.14203091859817504, "step": 3380 }, { "epoch": 0.6159024745269287, "grad_norm": 0.17994914948940277, "learning_rate": 1.70229942514244e-05, "loss": 0.14565644264221192, "step": 3385 }, { "epoch": 0.6168122270742358, "grad_norm": 0.1707388162612915, "learning_rate": 1.6953268516249486e-05, "loss": 0.14449434280395507, "step": 3390 }, { "epoch": 0.617721979621543, "grad_norm": 0.16425329446792603, "learning_rate": 1.6883612538445175e-05, "loss": 0.15185940265655518, "step": 3395 }, { "epoch": 0.61863173216885, "grad_norm": 0.15987788140773773, "learning_rate": 1.6814026921861335e-05, "loss": 0.14994431734085084, "step": 3400 }, { "epoch": 0.6195414847161572, "grad_norm": 0.2987690269947052, "learning_rate": 1.6744512269737894e-05, "loss": 0.14652738571166993, "step": 3405 }, { "epoch": 0.6204512372634643, "grad_norm": 0.1681315004825592, "learning_rate": 1.6675069184699574e-05, "loss": 0.14566165208816528, "step": 3410 }, { "epoch": 0.6213609898107715, "grad_norm": 0.15847846865653992, "learning_rate": 1.660569826875069e-05, "loss": 0.1374401330947876, "step": 3415 }, { "epoch": 0.6222707423580786, "grad_norm": 0.16370312869548798, "learning_rate": 1.6536400123269907e-05, "loss": 0.14905524253845215, "step": 3420 }, { "epoch": 0.6231804949053857, "grad_norm": 0.16054444015026093, "learning_rate": 1.6467175349005054e-05, "loss": 0.1496324896812439, "step": 3425 }, { "epoch": 0.6240902474526928, "grad_norm": 0.1663951277732849, "learning_rate": 1.639802454606788e-05, "loss": 0.1504170298576355, "step": 3430 }, { "epoch": 0.625, "grad_norm": 0.1591310054063797, "learning_rate": 1.6328948313928906e-05, "loss": 0.1410186171531677, "step": 3435 }, { "epoch": 0.6259097525473072, "grad_norm": 0.1637524962425232, "learning_rate": 1.6259947251412178e-05, "loss": 0.13963305950164795, "step": 3440 }, { "epoch": 0.6268195050946143, "grad_norm": 0.1688017100095749, "learning_rate": 1.6191021956690096e-05, "loss": 0.14727941751480103, "step": 3445 }, { "epoch": 0.6277292576419214, "grad_norm": 0.1691795438528061, "learning_rate": 1.612217302727821e-05, "loss": 0.14856183528900146, "step": 3450 }, { "epoch": 0.6286390101892285, "grad_norm": 0.18501746654510498, "learning_rate": 1.60534010600301e-05, "loss": 0.1481746554374695, "step": 3455 }, { "epoch": 0.6295487627365357, "grad_norm": 0.16234716773033142, "learning_rate": 1.5984706651132125e-05, "loss": 0.1427530527114868, "step": 3460 }, { "epoch": 0.6304585152838428, "grad_norm": 0.16013780236244202, "learning_rate": 1.5916090396098293e-05, "loss": 0.14264426231384278, "step": 3465 }, { "epoch": 0.63136826783115, "grad_norm": 0.17116396129131317, "learning_rate": 1.5847552889765095e-05, "loss": 0.14109257459640503, "step": 3470 }, { "epoch": 0.632278020378457, "grad_norm": 0.16949769854545593, "learning_rate": 1.5779094726286344e-05, "loss": 0.1387040376663208, "step": 3475 }, { "epoch": 0.6331877729257642, "grad_norm": 0.14983431994915009, "learning_rate": 1.5710716499128044e-05, "loss": 0.13645120859146118, "step": 3480 }, { "epoch": 0.6340975254730713, "grad_norm": 0.1632554531097412, "learning_rate": 1.564241880106321e-05, "loss": 0.14883992671966553, "step": 3485 }, { "epoch": 0.6350072780203785, "grad_norm": 0.15686506032943726, "learning_rate": 1.5574202224166744e-05, "loss": 0.14244272708892822, "step": 3490 }, { "epoch": 0.6359170305676856, "grad_norm": 0.18843458592891693, "learning_rate": 1.5506067359810333e-05, "loss": 0.15149861574172974, "step": 3495 }, { "epoch": 0.6368267831149927, "grad_norm": 0.15874551236629486, "learning_rate": 1.5438014798657275e-05, "loss": 0.15188233852386473, "step": 3500 }, { "epoch": 0.6377365356622998, "grad_norm": 0.17014239728450775, "learning_rate": 1.5370045130657366e-05, "loss": 0.14694437980651856, "step": 3505 }, { "epoch": 0.638646288209607, "grad_norm": 0.14744038879871368, "learning_rate": 1.5302158945041838e-05, "loss": 0.14434736967086792, "step": 3510 }, { "epoch": 0.6395560407569141, "grad_norm": 0.2069770246744156, "learning_rate": 1.523435683031818e-05, "loss": 0.13982917070388795, "step": 3515 }, { "epoch": 0.6404657933042213, "grad_norm": 0.17811502516269684, "learning_rate": 1.5166639374265063e-05, "loss": 0.1408839702606201, "step": 3520 }, { "epoch": 0.6413755458515283, "grad_norm": 0.165786474943161, "learning_rate": 1.509900716392728e-05, "loss": 0.15312877893447877, "step": 3525 }, { "epoch": 0.6422852983988355, "grad_norm": 0.1633884161710739, "learning_rate": 1.5031460785610596e-05, "loss": 0.1488795518875122, "step": 3530 }, { "epoch": 0.6431950509461426, "grad_norm": 0.16498984396457672, "learning_rate": 1.4964000824876723e-05, "loss": 0.15031465291976928, "step": 3535 }, { "epoch": 0.6441048034934498, "grad_norm": 0.18043678998947144, "learning_rate": 1.4896627866538191e-05, "loss": 0.147829806804657, "step": 3540 }, { "epoch": 0.6450145560407569, "grad_norm": 0.16813597083091736, "learning_rate": 1.4829342494653315e-05, "loss": 0.1418998956680298, "step": 3545 }, { "epoch": 0.645924308588064, "grad_norm": 0.1817242056131363, "learning_rate": 1.4762145292521118e-05, "loss": 0.14508869647979736, "step": 3550 }, { "epoch": 0.6468340611353712, "grad_norm": 0.14666494727134705, "learning_rate": 1.469503684267628e-05, "loss": 0.14159854650497436, "step": 3555 }, { "epoch": 0.6477438136826783, "grad_norm": 0.16485381126403809, "learning_rate": 1.4628017726884086e-05, "loss": 0.14419105052947997, "step": 3560 }, { "epoch": 0.6486535662299855, "grad_norm": 0.16100342571735382, "learning_rate": 1.4561088526135375e-05, "loss": 0.14501721858978273, "step": 3565 }, { "epoch": 0.6495633187772926, "grad_norm": 0.16996590793132782, "learning_rate": 1.4494249820641493e-05, "loss": 0.1377166509628296, "step": 3570 }, { "epoch": 0.6504730713245997, "grad_norm": 0.16168837249279022, "learning_rate": 1.4427502189829339e-05, "loss": 0.1414325475692749, "step": 3575 }, { "epoch": 0.6513828238719068, "grad_norm": 0.16318906843662262, "learning_rate": 1.436084621233621e-05, "loss": 0.14685193300247193, "step": 3580 }, { "epoch": 0.652292576419214, "grad_norm": 0.1636219322681427, "learning_rate": 1.4294282466004899e-05, "loss": 0.1405899167060852, "step": 3585 }, { "epoch": 0.6532023289665211, "grad_norm": 0.1838461309671402, "learning_rate": 1.422781152787865e-05, "loss": 0.14386332035064697, "step": 3590 }, { "epoch": 0.6541120815138283, "grad_norm": 0.1796344667673111, "learning_rate": 1.4161433974196115e-05, "loss": 0.1513024687767029, "step": 3595 }, { "epoch": 0.6550218340611353, "grad_norm": 0.16424529254436493, "learning_rate": 1.4095150380386427e-05, "loss": 0.14238927364349366, "step": 3600 }, { "epoch": 0.6559315866084425, "grad_norm": 0.19264160096645355, "learning_rate": 1.402896132106415e-05, "loss": 0.14297477006912232, "step": 3605 }, { "epoch": 0.6568413391557496, "grad_norm": 0.18319948017597198, "learning_rate": 1.3962867370024347e-05, "loss": 0.1448880434036255, "step": 3610 }, { "epoch": 0.6577510917030568, "grad_norm": 0.16507290303707123, "learning_rate": 1.389686910023758e-05, "loss": 0.14724698066711425, "step": 3615 }, { "epoch": 0.6586608442503639, "grad_norm": 0.17871244251728058, "learning_rate": 1.3830967083844942e-05, "loss": 0.14479386806488037, "step": 3620 }, { "epoch": 0.659570596797671, "grad_norm": 0.1846228390932083, "learning_rate": 1.3765161892153112e-05, "loss": 0.1453616738319397, "step": 3625 }, { "epoch": 0.6604803493449781, "grad_norm": 0.17185978591442108, "learning_rate": 1.3699454095629372e-05, "loss": 0.14906206130981445, "step": 3630 }, { "epoch": 0.6613901018922853, "grad_norm": 0.14751191437244415, "learning_rate": 1.3633844263896698e-05, "loss": 0.13991892337799072, "step": 3635 }, { "epoch": 0.6622998544395924, "grad_norm": 0.22059763967990875, "learning_rate": 1.3568332965728817e-05, "loss": 0.14680869579315187, "step": 3640 }, { "epoch": 0.6632096069868996, "grad_norm": 0.15295909345149994, "learning_rate": 1.3502920769045232e-05, "loss": 0.1404443383216858, "step": 3645 }, { "epoch": 0.6641193595342066, "grad_norm": 0.14600558578968048, "learning_rate": 1.3437608240906364e-05, "loss": 0.14663270711898804, "step": 3650 }, { "epoch": 0.6650291120815138, "grad_norm": 0.15548352897167206, "learning_rate": 1.3372395947508587e-05, "loss": 0.1431443452835083, "step": 3655 }, { "epoch": 0.665938864628821, "grad_norm": 0.1813388466835022, "learning_rate": 1.3307284454179342e-05, "loss": 0.1458706736564636, "step": 3660 }, { "epoch": 0.6668486171761281, "grad_norm": 0.16326870024204254, "learning_rate": 1.3242274325372247e-05, "loss": 0.14700595140457154, "step": 3665 }, { "epoch": 0.6677583697234353, "grad_norm": 0.18779197335243225, "learning_rate": 1.3177366124662149e-05, "loss": 0.1497237801551819, "step": 3670 }, { "epoch": 0.6686681222707423, "grad_norm": 0.16291002929210663, "learning_rate": 1.3112560414740315e-05, "loss": 0.1387086868286133, "step": 3675 }, { "epoch": 0.6695778748180495, "grad_norm": 0.1532297134399414, "learning_rate": 1.3047857757409487e-05, "loss": 0.14497545957565308, "step": 3680 }, { "epoch": 0.6704876273653566, "grad_norm": 0.14697515964508057, "learning_rate": 1.2983258713579066e-05, "loss": 0.1494283437728882, "step": 3685 }, { "epoch": 0.6713973799126638, "grad_norm": 0.15213452279567719, "learning_rate": 1.2918763843260218e-05, "loss": 0.1468907594680786, "step": 3690 }, { "epoch": 0.6723071324599709, "grad_norm": 0.1745215803384781, "learning_rate": 1.285437370556099e-05, "loss": 0.14997754096984864, "step": 3695 }, { "epoch": 0.673216885007278, "grad_norm": 0.19207637012004852, "learning_rate": 1.2790088858681577e-05, "loss": 0.14202862977981567, "step": 3700 }, { "epoch": 0.6741266375545851, "grad_norm": 0.1521359086036682, "learning_rate": 1.2725909859909313e-05, "loss": 0.14547673463821412, "step": 3705 }, { "epoch": 0.6750363901018923, "grad_norm": 0.16975535452365875, "learning_rate": 1.2661837265613999e-05, "loss": 0.14006874561309815, "step": 3710 }, { "epoch": 0.6759461426491994, "grad_norm": 0.22234582901000977, "learning_rate": 1.2597871631242992e-05, "loss": 0.13691173791885375, "step": 3715 }, { "epoch": 0.6768558951965066, "grad_norm": 0.16082969307899475, "learning_rate": 1.2534013511316383e-05, "loss": 0.14932308197021485, "step": 3720 }, { "epoch": 0.6777656477438136, "grad_norm": 0.1751091182231903, "learning_rate": 1.247026345942226e-05, "loss": 0.14531974792480468, "step": 3725 }, { "epoch": 0.6786754002911208, "grad_norm": 0.15838147699832916, "learning_rate": 1.2406622028211844e-05, "loss": 0.14759832620620728, "step": 3730 }, { "epoch": 0.6795851528384279, "grad_norm": 0.1771744042634964, "learning_rate": 1.2343089769394714e-05, "loss": 0.1382831573486328, "step": 3735 }, { "epoch": 0.6804949053857351, "grad_norm": 0.16301538050174713, "learning_rate": 1.2279667233734037e-05, "loss": 0.14444775581359864, "step": 3740 }, { "epoch": 0.6814046579330422, "grad_norm": 0.1584121286869049, "learning_rate": 1.2216354971041796e-05, "loss": 0.14200170040130616, "step": 3745 }, { "epoch": 0.6823144104803494, "grad_norm": 0.139187291264534, "learning_rate": 1.2153153530174007e-05, "loss": 0.14318310022354125, "step": 3750 }, { "epoch": 0.6832241630276564, "grad_norm": 0.13665248453617096, "learning_rate": 1.2090063459025955e-05, "loss": 0.1411946654319763, "step": 3755 }, { "epoch": 0.6841339155749636, "grad_norm": 0.16273781657218933, "learning_rate": 1.2027085304527475e-05, "loss": 0.14873508214950562, "step": 3760 }, { "epoch": 0.6850436681222707, "grad_norm": 0.16317526996135712, "learning_rate": 1.1964219612638194e-05, "loss": 0.14644203186035157, "step": 3765 }, { "epoch": 0.6859534206695779, "grad_norm": 0.17253617942333221, "learning_rate": 1.1901466928342777e-05, "loss": 0.14027841091156007, "step": 3770 }, { "epoch": 0.6868631732168851, "grad_norm": 0.19692830741405487, "learning_rate": 1.183882779564624e-05, "loss": 0.14411110877990724, "step": 3775 }, { "epoch": 0.6877729257641921, "grad_norm": 0.15444578230381012, "learning_rate": 1.1776302757569214e-05, "loss": 0.14355008602142333, "step": 3780 }, { "epoch": 0.6886826783114993, "grad_norm": 0.1622200757265091, "learning_rate": 1.1713892356143239e-05, "loss": 0.14794334173202514, "step": 3785 }, { "epoch": 0.6895924308588064, "grad_norm": 0.1898501068353653, "learning_rate": 1.1651597132406073e-05, "loss": 0.1418622612953186, "step": 3790 }, { "epoch": 0.6905021834061136, "grad_norm": 0.17803208529949188, "learning_rate": 1.1589417626396973e-05, "loss": 0.14576040506362914, "step": 3795 }, { "epoch": 0.6914119359534207, "grad_norm": 0.17138013243675232, "learning_rate": 1.1527354377152053e-05, "loss": 0.14494270086288452, "step": 3800 } ], "logging_steps": 5, "max_steps": 5500, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 2.0912782443288745e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }