{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.29112081513828236, "eval_steps": 100, "global_step": 1600, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0009097525473071324, "grad_norm": 1.0602493286132812, "learning_rate": 1.2121212121212122e-06, "loss": 1.7156932830810547, "step": 5 }, { "epoch": 0.001819505094614265, "grad_norm": 1.1577719449996948, "learning_rate": 2.7272727272727272e-06, "loss": 1.6629371643066406, "step": 10 }, { "epoch": 0.0027292576419213972, "grad_norm": 1.0288419723510742, "learning_rate": 4.242424242424243e-06, "loss": 1.6706295013427734, "step": 15 }, { "epoch": 0.00363901018922853, "grad_norm": 2.129403829574585, "learning_rate": 5.7575757575757586e-06, "loss": 1.7363752365112304, "step": 20 }, { "epoch": 0.004548762736535662, "grad_norm": 1.9468326568603516, "learning_rate": 7.272727272727272e-06, "loss": 1.7111135482788087, "step": 25 }, { "epoch": 0.0054585152838427945, "grad_norm": 1.1269357204437256, "learning_rate": 8.787878787878788e-06, "loss": 1.6924203872680663, "step": 30 }, { "epoch": 0.006368267831149927, "grad_norm": 1.4021248817443848, "learning_rate": 1.0303030303030304e-05, "loss": 1.658310317993164, "step": 35 }, { "epoch": 0.00727802037845706, "grad_norm": 1.313381314277649, "learning_rate": 1.1818181818181819e-05, "loss": 1.5383296012878418, "step": 40 }, { "epoch": 0.008187772925764192, "grad_norm": 2.4359891414642334, "learning_rate": 1.3333333333333333e-05, "loss": 1.4302565574645996, "step": 45 }, { "epoch": 0.009097525473071324, "grad_norm": 1.6459542512893677, "learning_rate": 1.484848484848485e-05, "loss": 1.2602953910827637, "step": 50 }, { "epoch": 0.010007278020378457, "grad_norm": 0.7953159213066101, "learning_rate": 1.6363636363636366e-05, "loss": 1.204326343536377, "step": 55 }, { "epoch": 0.010917030567685589, "grad_norm": 0.5824465155601501, "learning_rate": 1.787878787878788e-05, "loss": 1.068561840057373, "step": 60 }, { "epoch": 0.011826783114992722, "grad_norm": 0.39265626668930054, "learning_rate": 1.9393939393939395e-05, "loss": 0.9570062637329102, "step": 65 }, { "epoch": 0.012736535662299854, "grad_norm": 0.3387283384799957, "learning_rate": 2.090909090909091e-05, "loss": 0.9454713821411133, "step": 70 }, { "epoch": 0.013646288209606987, "grad_norm": 0.3182811141014099, "learning_rate": 2.2424242424242424e-05, "loss": 0.8901592254638672, "step": 75 }, { "epoch": 0.01455604075691412, "grad_norm": 0.2735312879085541, "learning_rate": 2.393939393939394e-05, "loss": 0.8491583824157715, "step": 80 }, { "epoch": 0.015465793304221253, "grad_norm": 0.2376435250043869, "learning_rate": 2.5454545454545454e-05, "loss": 0.8109179496765136, "step": 85 }, { "epoch": 0.016375545851528384, "grad_norm": 0.2161586880683899, "learning_rate": 2.696969696969697e-05, "loss": 0.76962308883667, "step": 90 }, { "epoch": 0.017285298398835518, "grad_norm": 0.19587980210781097, "learning_rate": 2.8484848484848486e-05, "loss": 0.7301986694335938, "step": 95 }, { "epoch": 0.018195050946142648, "grad_norm": 0.20971694588661194, "learning_rate": 3e-05, "loss": 0.7269618034362793, "step": 100 }, { "epoch": 0.018195050946142648, "eval_loss": 2.605874538421631, "eval_runtime": 1120.0905, "eval_samples_per_second": 33.935, "eval_steps_per_second": 8.484, "step": 100 }, { "epoch": 0.01910480349344978, "grad_norm": 0.10413152724504471, "learning_rate": 3.151515151515151e-05, "loss": 0.3250573635101318, "step": 105 }, { "epoch": 0.020014556040756915, "grad_norm": 0.09383206814527512, "learning_rate": 3.303030303030303e-05, "loss": 0.3277724742889404, "step": 110 }, { "epoch": 0.020924308588064048, "grad_norm": 0.1195850670337677, "learning_rate": 3.454545454545455e-05, "loss": 0.3215961217880249, "step": 115 }, { "epoch": 0.021834061135371178, "grad_norm": 0.0715397521853447, "learning_rate": 3.606060606060606e-05, "loss": 0.3120795965194702, "step": 120 }, { "epoch": 0.02274381368267831, "grad_norm": 0.068007692694664, "learning_rate": 3.757575757575758e-05, "loss": 0.2964257955551147, "step": 125 }, { "epoch": 0.023653566229985445, "grad_norm": 0.09345484524965286, "learning_rate": 3.909090909090909e-05, "loss": 0.30776252746582033, "step": 130 }, { "epoch": 0.024563318777292575, "grad_norm": 0.05577846243977547, "learning_rate": 4.0606060606060606e-05, "loss": 0.3180255889892578, "step": 135 }, { "epoch": 0.025473071324599708, "grad_norm": 0.05919989198446274, "learning_rate": 4.212121212121212e-05, "loss": 0.31608285903930666, "step": 140 }, { "epoch": 0.02638282387190684, "grad_norm": 0.05644674599170685, "learning_rate": 4.3636363636363636e-05, "loss": 0.2993780136108398, "step": 145 }, { "epoch": 0.027292576419213975, "grad_norm": 0.059986088424921036, "learning_rate": 4.515151515151516e-05, "loss": 0.2931638479232788, "step": 150 }, { "epoch": 0.028202328966521105, "grad_norm": 0.05941484495997429, "learning_rate": 4.666666666666667e-05, "loss": 0.29284651279449464, "step": 155 }, { "epoch": 0.02911208151382824, "grad_norm": 0.0579044483602047, "learning_rate": 4.8181818181818186e-05, "loss": 0.2927037000656128, "step": 160 }, { "epoch": 0.030021834061135372, "grad_norm": 0.061985693871974945, "learning_rate": 4.9696969696969694e-05, "loss": 0.28671720027923586, "step": 165 }, { "epoch": 0.030931586608442505, "grad_norm": 0.05715535953640938, "learning_rate": 4.999993064772809e-05, "loss": 0.2817929744720459, "step": 170 }, { "epoch": 0.03184133915574964, "grad_norm": 0.06549780815839767, "learning_rate": 4.999964890478288e-05, "loss": 0.27853829860687257, "step": 175 }, { "epoch": 0.03275109170305677, "grad_norm": 0.05948757752776146, "learning_rate": 4.999915043908795e-05, "loss": 0.27522289752960205, "step": 180 }, { "epoch": 0.0336608442503639, "grad_norm": 0.06262889504432678, "learning_rate": 4.9998435254964515e-05, "loss": 0.270997428894043, "step": 185 }, { "epoch": 0.034570596797671035, "grad_norm": 0.06916829943656921, "learning_rate": 4.999750335861253e-05, "loss": 0.2788438558578491, "step": 190 }, { "epoch": 0.035480349344978165, "grad_norm": 0.06128217652440071, "learning_rate": 4.9996354758110624e-05, "loss": 0.25649352073669435, "step": 195 }, { "epoch": 0.036390101892285295, "grad_norm": 0.06704027950763702, "learning_rate": 4.999498946341606e-05, "loss": 0.25619523525238036, "step": 200 }, { "epoch": 0.03729985443959243, "grad_norm": 0.061678580939769745, "learning_rate": 4.999340748636462e-05, "loss": 0.24956226348876953, "step": 205 }, { "epoch": 0.03820960698689956, "grad_norm": 0.07328873127698898, "learning_rate": 4.999160884067051e-05, "loss": 0.26169676780700685, "step": 210 }, { "epoch": 0.0391193595342067, "grad_norm": 0.08287990838289261, "learning_rate": 4.9989593541926246e-05, "loss": 0.2574604034423828, "step": 215 }, { "epoch": 0.04002911208151383, "grad_norm": 0.06787359714508057, "learning_rate": 4.9987361607602525e-05, "loss": 0.25351409912109374, "step": 220 }, { "epoch": 0.04093886462882096, "grad_norm": 0.06695502996444702, "learning_rate": 4.998491305704805e-05, "loss": 0.24522039890289307, "step": 225 }, { "epoch": 0.041848617176128096, "grad_norm": 0.08872214704751968, "learning_rate": 4.9982247911489375e-05, "loss": 0.2581867933273315, "step": 230 }, { "epoch": 0.042758369723435226, "grad_norm": 0.07637131959199905, "learning_rate": 4.9979366194030743e-05, "loss": 0.25569658279418944, "step": 235 }, { "epoch": 0.043668122270742356, "grad_norm": 0.08158119022846222, "learning_rate": 4.997626792965385e-05, "loss": 0.2529409646987915, "step": 240 }, { "epoch": 0.04457787481804949, "grad_norm": 0.07529161125421524, "learning_rate": 4.997295314521766e-05, "loss": 0.24049024581909179, "step": 245 }, { "epoch": 0.04548762736535662, "grad_norm": 0.08860139548778534, "learning_rate": 4.996942186945813e-05, "loss": 0.2490522861480713, "step": 250 }, { "epoch": 0.04639737991266375, "grad_norm": 0.0850321501493454, "learning_rate": 4.9965674132988005e-05, "loss": 0.24180831909179687, "step": 255 }, { "epoch": 0.04730713245997089, "grad_norm": 0.07556115090847015, "learning_rate": 4.996170996829653e-05, "loss": 0.2509631872177124, "step": 260 }, { "epoch": 0.04821688500727802, "grad_norm": 0.07971206307411194, "learning_rate": 4.995752940974918e-05, "loss": 0.24398891925811766, "step": 265 }, { "epoch": 0.04912663755458515, "grad_norm": 0.09149336814880371, "learning_rate": 4.9953132493587344e-05, "loss": 0.2300492286682129, "step": 270 }, { "epoch": 0.050036390101892286, "grad_norm": 0.08265820890665054, "learning_rate": 4.9948519257928034e-05, "loss": 0.24246792793273925, "step": 275 }, { "epoch": 0.050946142649199416, "grad_norm": 0.10328587144613266, "learning_rate": 4.9943689742763534e-05, "loss": 0.2367171049118042, "step": 280 }, { "epoch": 0.05185589519650655, "grad_norm": 0.0836917981505394, "learning_rate": 4.993864398996105e-05, "loss": 0.23215813636779786, "step": 285 }, { "epoch": 0.05276564774381368, "grad_norm": 0.09475161135196686, "learning_rate": 4.99333820432624e-05, "loss": 0.2350748062133789, "step": 290 }, { "epoch": 0.05367540029112081, "grad_norm": 0.08040128648281097, "learning_rate": 4.992790394828355e-05, "loss": 0.23253886699676513, "step": 295 }, { "epoch": 0.05458515283842795, "grad_norm": 0.08852150291204453, "learning_rate": 4.992220975251428e-05, "loss": 0.23856515884399415, "step": 300 }, { "epoch": 0.05549490538573508, "grad_norm": 0.09565229713916779, "learning_rate": 4.991629950531775e-05, "loss": 0.23311660289764405, "step": 305 }, { "epoch": 0.05640465793304221, "grad_norm": 0.08158160001039505, "learning_rate": 4.991017325793009e-05, "loss": 0.22467944622039795, "step": 310 }, { "epoch": 0.05731441048034935, "grad_norm": 0.07746429741382599, "learning_rate": 4.990383106345994e-05, "loss": 0.229844069480896, "step": 315 }, { "epoch": 0.05822416302765648, "grad_norm": 0.08564355969429016, "learning_rate": 4.989727297688797e-05, "loss": 0.22414517402648926, "step": 320 }, { "epoch": 0.05913391557496361, "grad_norm": 0.07517435401678085, "learning_rate": 4.9890499055066435e-05, "loss": 0.2236532211303711, "step": 325 }, { "epoch": 0.060043668122270744, "grad_norm": 0.111734539270401, "learning_rate": 4.988350935671869e-05, "loss": 0.21474847793579102, "step": 330 }, { "epoch": 0.060953420669577874, "grad_norm": 0.09906989336013794, "learning_rate": 4.987630394243866e-05, "loss": 0.23321933746337892, "step": 335 }, { "epoch": 0.06186317321688501, "grad_norm": 0.10131457448005676, "learning_rate": 4.98688828746903e-05, "loss": 0.2310662031173706, "step": 340 }, { "epoch": 0.06277292576419213, "grad_norm": 0.09203507006168365, "learning_rate": 4.986124621780708e-05, "loss": 0.22021169662475587, "step": 345 }, { "epoch": 0.06368267831149928, "grad_norm": 0.09505912661552429, "learning_rate": 4.9853394037991416e-05, "loss": 0.2197155237197876, "step": 350 }, { "epoch": 0.06459243085880641, "grad_norm": 0.09038657695055008, "learning_rate": 4.984532640331412e-05, "loss": 0.22066287994384765, "step": 355 }, { "epoch": 0.06550218340611354, "grad_norm": 0.09707064181566238, "learning_rate": 4.9837043383713753e-05, "loss": 0.22455451488494874, "step": 360 }, { "epoch": 0.06641193595342067, "grad_norm": 0.10367228090763092, "learning_rate": 4.98285450509961e-05, "loss": 0.21993820667266845, "step": 365 }, { "epoch": 0.0673216885007278, "grad_norm": 0.12229471653699875, "learning_rate": 4.9819831478833456e-05, "loss": 0.2168867588043213, "step": 370 }, { "epoch": 0.06823144104803494, "grad_norm": 0.0964592918753624, "learning_rate": 4.981090274276406e-05, "loss": 0.21579203605651856, "step": 375 }, { "epoch": 0.06914119359534207, "grad_norm": 0.09400496631860733, "learning_rate": 4.980175892019141e-05, "loss": 0.20972180366516113, "step": 380 }, { "epoch": 0.0700509461426492, "grad_norm": 0.08158645778894424, "learning_rate": 4.9792400090383594e-05, "loss": 0.22148358821868896, "step": 385 }, { "epoch": 0.07096069868995633, "grad_norm": 0.10916394740343094, "learning_rate": 4.978282633447261e-05, "loss": 0.2214418649673462, "step": 390 }, { "epoch": 0.07187045123726346, "grad_norm": 0.11138810962438583, "learning_rate": 4.9773037735453636e-05, "loss": 0.21814754009246826, "step": 395 }, { "epoch": 0.07278020378457059, "grad_norm": 0.10914396494626999, "learning_rate": 4.9763034378184365e-05, "loss": 0.21310818195343018, "step": 400 }, { "epoch": 0.07368995633187773, "grad_norm": 0.1043366864323616, "learning_rate": 4.975281634938421e-05, "loss": 0.21266789436340333, "step": 405 }, { "epoch": 0.07459970887918486, "grad_norm": 0.1036868542432785, "learning_rate": 4.9742383737633594e-05, "loss": 0.21606721878051757, "step": 410 }, { "epoch": 0.075509461426492, "grad_norm": 0.11640442907810211, "learning_rate": 4.9731736633373144e-05, "loss": 0.21532948017120362, "step": 415 }, { "epoch": 0.07641921397379912, "grad_norm": 0.11219926178455353, "learning_rate": 4.9720875128902956e-05, "loss": 0.2191627025604248, "step": 420 }, { "epoch": 0.07732896652110625, "grad_norm": 0.12103637307882309, "learning_rate": 4.970979931838176e-05, "loss": 0.20938868522644044, "step": 425 }, { "epoch": 0.0782387190684134, "grad_norm": 0.13274189829826355, "learning_rate": 4.96985092978261e-05, "loss": 0.21792960166931152, "step": 430 }, { "epoch": 0.07914847161572053, "grad_norm": 0.11164513230323792, "learning_rate": 4.968700516510954e-05, "loss": 0.2022618055343628, "step": 435 }, { "epoch": 0.08005822416302766, "grad_norm": 0.09532847255468369, "learning_rate": 4.967528701996174e-05, "loss": 0.21255812644958497, "step": 440 }, { "epoch": 0.08096797671033479, "grad_norm": 0.10279258340597153, "learning_rate": 4.96633549639677e-05, "loss": 0.20683050155639648, "step": 445 }, { "epoch": 0.08187772925764192, "grad_norm": 0.1257462352514267, "learning_rate": 4.965120910056677e-05, "loss": 0.21419920921325683, "step": 450 }, { "epoch": 0.08278748180494905, "grad_norm": 0.11663137376308441, "learning_rate": 4.963884953505186e-05, "loss": 0.2072287082672119, "step": 455 }, { "epoch": 0.08369723435225619, "grad_norm": 0.10488224029541016, "learning_rate": 4.96262763745684e-05, "loss": 0.1982678532600403, "step": 460 }, { "epoch": 0.08460698689956332, "grad_norm": 0.11801692098379135, "learning_rate": 4.961348972811354e-05, "loss": 0.20662031173706055, "step": 465 }, { "epoch": 0.08551673944687045, "grad_norm": 0.11318827420473099, "learning_rate": 4.96004897065351e-05, "loss": 0.20947303771972656, "step": 470 }, { "epoch": 0.08642649199417758, "grad_norm": 0.13409486413002014, "learning_rate": 4.95872764225307e-05, "loss": 0.19670876264572143, "step": 475 }, { "epoch": 0.08733624454148471, "grad_norm": 0.14440792798995972, "learning_rate": 4.957384999064672e-05, "loss": 0.19842848777770997, "step": 480 }, { "epoch": 0.08824599708879186, "grad_norm": 0.12246996909379959, "learning_rate": 4.956021052727731e-05, "loss": 0.20318071842193602, "step": 485 }, { "epoch": 0.08915574963609899, "grad_norm": 0.13437233865261078, "learning_rate": 4.954635815066342e-05, "loss": 0.21675212383270265, "step": 490 }, { "epoch": 0.09006550218340612, "grad_norm": 0.11109672486782074, "learning_rate": 4.9532292980891744e-05, "loss": 0.2100757837295532, "step": 495 }, { "epoch": 0.09097525473071325, "grad_norm": 0.1388893872499466, "learning_rate": 4.9518015139893675e-05, "loss": 0.20303285121917725, "step": 500 }, { "epoch": 0.09188500727802038, "grad_norm": 0.13239721953868866, "learning_rate": 4.950352475144427e-05, "loss": 0.2152268409729004, "step": 505 }, { "epoch": 0.0927947598253275, "grad_norm": 0.12834979593753815, "learning_rate": 4.948882194116119e-05, "loss": 0.20799248218536376, "step": 510 }, { "epoch": 0.09370451237263465, "grad_norm": 0.11886704713106155, "learning_rate": 4.947390683650354e-05, "loss": 0.20394976139068605, "step": 515 }, { "epoch": 0.09461426491994178, "grad_norm": 0.11398876458406448, "learning_rate": 4.945877956677083e-05, "loss": 0.2091092586517334, "step": 520 }, { "epoch": 0.09552401746724891, "grad_norm": 0.1422540694475174, "learning_rate": 4.944344026310186e-05, "loss": 0.19564238786697388, "step": 525 }, { "epoch": 0.09643377001455604, "grad_norm": 0.11359584331512451, "learning_rate": 4.9427889058473535e-05, "loss": 0.20493624210357667, "step": 530 }, { "epoch": 0.09734352256186317, "grad_norm": 0.11703553050756454, "learning_rate": 4.941212608769974e-05, "loss": 0.2098615884780884, "step": 535 }, { "epoch": 0.0982532751091703, "grad_norm": 0.14552047848701477, "learning_rate": 4.939615148743017e-05, "loss": 0.20382182598114013, "step": 540 }, { "epoch": 0.09916302765647744, "grad_norm": 0.13178016245365143, "learning_rate": 4.937996539614914e-05, "loss": 0.19901862144470214, "step": 545 }, { "epoch": 0.10007278020378457, "grad_norm": 0.635392427444458, "learning_rate": 4.936356795417439e-05, "loss": 0.20694944858551026, "step": 550 }, { "epoch": 0.1009825327510917, "grad_norm": 0.15019077062606812, "learning_rate": 4.934695930365586e-05, "loss": 0.19313746690750122, "step": 555 }, { "epoch": 0.10189228529839883, "grad_norm": 0.12941956520080566, "learning_rate": 4.9330139588574474e-05, "loss": 0.19671722650527954, "step": 560 }, { "epoch": 0.10280203784570596, "grad_norm": 0.13818831741809845, "learning_rate": 4.931310895474088e-05, "loss": 0.20026786327362062, "step": 565 }, { "epoch": 0.1037117903930131, "grad_norm": 0.12011194974184036, "learning_rate": 4.929586754979417e-05, "loss": 0.1932437539100647, "step": 570 }, { "epoch": 0.10462154294032024, "grad_norm": 0.1345364898443222, "learning_rate": 4.9278415523200644e-05, "loss": 0.20245940685272218, "step": 575 }, { "epoch": 0.10553129548762737, "grad_norm": 0.13281017541885376, "learning_rate": 4.926075302625247e-05, "loss": 0.19864981174468993, "step": 580 }, { "epoch": 0.1064410480349345, "grad_norm": 0.13465586304664612, "learning_rate": 4.924288021206639e-05, "loss": 0.19573183059692384, "step": 585 }, { "epoch": 0.10735080058224163, "grad_norm": 0.15225961804389954, "learning_rate": 4.9224797235582396e-05, "loss": 0.19946801662445068, "step": 590 }, { "epoch": 0.10826055312954876, "grad_norm": 0.12816746532917023, "learning_rate": 4.92065042535624e-05, "loss": 0.19851526021957397, "step": 595 }, { "epoch": 0.1091703056768559, "grad_norm": 0.13802853226661682, "learning_rate": 4.9188001424588824e-05, "loss": 0.19321763515472412, "step": 600 }, { "epoch": 0.11008005822416303, "grad_norm": 0.17504797875881195, "learning_rate": 4.9169288909063295e-05, "loss": 0.2032616138458252, "step": 605 }, { "epoch": 0.11098981077147016, "grad_norm": 0.13544194400310516, "learning_rate": 4.91503668692052e-05, "loss": 0.2011256456375122, "step": 610 }, { "epoch": 0.11189956331877729, "grad_norm": 1.3976134061813354, "learning_rate": 4.91312354690503e-05, "loss": 0.19916868209838867, "step": 615 }, { "epoch": 0.11280931586608442, "grad_norm": 0.1465059071779251, "learning_rate": 4.91118948744493e-05, "loss": 0.19487457275390624, "step": 620 }, { "epoch": 0.11371906841339156, "grad_norm": 0.12103168666362762, "learning_rate": 4.909234525306645e-05, "loss": 0.1907251238822937, "step": 625 }, { "epoch": 0.1146288209606987, "grad_norm": 0.12660574913024902, "learning_rate": 4.907258677437802e-05, "loss": 0.19327253103256226, "step": 630 }, { "epoch": 0.11553857350800582, "grad_norm": 0.1347813606262207, "learning_rate": 4.90526196096709e-05, "loss": 0.19637736082077026, "step": 635 }, { "epoch": 0.11644832605531295, "grad_norm": 0.14953652024269104, "learning_rate": 4.903244393204107e-05, "loss": 0.20325069427490233, "step": 640 }, { "epoch": 0.11735807860262008, "grad_norm": 0.13936272263526917, "learning_rate": 4.901205991639213e-05, "loss": 0.1930275321006775, "step": 645 }, { "epoch": 0.11826783114992721, "grad_norm": 0.1448420137166977, "learning_rate": 4.899146773943374e-05, "loss": 0.20026936531066894, "step": 650 }, { "epoch": 0.11917758369723436, "grad_norm": 0.1312534064054489, "learning_rate": 4.897066757968014e-05, "loss": 0.19062033891677857, "step": 655 }, { "epoch": 0.12008733624454149, "grad_norm": 0.13644742965698242, "learning_rate": 4.894965961744859e-05, "loss": 0.18719595670700073, "step": 660 }, { "epoch": 0.12099708879184862, "grad_norm": 0.14276087284088135, "learning_rate": 4.892844403485777e-05, "loss": 0.19784307479858398, "step": 665 }, { "epoch": 0.12190684133915575, "grad_norm": 0.14735399186611176, "learning_rate": 4.890702101582623e-05, "loss": 0.19163782596588136, "step": 670 }, { "epoch": 0.12281659388646288, "grad_norm": 0.15742065012454987, "learning_rate": 4.888539074607082e-05, "loss": 0.19312986135482788, "step": 675 }, { "epoch": 0.12372634643377002, "grad_norm": 0.12917031347751617, "learning_rate": 4.8863553413105025e-05, "loss": 0.20066320896148682, "step": 680 }, { "epoch": 0.12463609898107715, "grad_norm": 0.1484801322221756, "learning_rate": 4.884150920623737e-05, "loss": 0.20096096992492676, "step": 685 }, { "epoch": 0.12554585152838427, "grad_norm": 0.1455296128988266, "learning_rate": 4.88192583165698e-05, "loss": 0.20518505573272705, "step": 690 }, { "epoch": 0.12645560407569142, "grad_norm": 0.14517490565776825, "learning_rate": 4.879680093699598e-05, "loss": 0.18859238624572755, "step": 695 }, { "epoch": 0.12736535662299855, "grad_norm": 0.18778090178966522, "learning_rate": 4.877413726219964e-05, "loss": 0.197074818611145, "step": 700 }, { "epoch": 0.12827510917030568, "grad_norm": 0.13497677445411682, "learning_rate": 4.87512674886529e-05, "loss": 0.18713107109069824, "step": 705 }, { "epoch": 0.12918486171761281, "grad_norm": 0.12657155096530914, "learning_rate": 4.872819181461455e-05, "loss": 0.1858484387397766, "step": 710 }, { "epoch": 0.13009461426491994, "grad_norm": 0.11458148807287216, "learning_rate": 4.870491044012834e-05, "loss": 0.18732179403305055, "step": 715 }, { "epoch": 0.13100436681222707, "grad_norm": 0.13000249862670898, "learning_rate": 4.8681423567021244e-05, "loss": 0.1872936010360718, "step": 720 }, { "epoch": 0.1319141193595342, "grad_norm": 0.14580890536308289, "learning_rate": 4.865773139890172e-05, "loss": 0.19280019998550416, "step": 725 }, { "epoch": 0.13282387190684133, "grad_norm": 0.1507277935743332, "learning_rate": 4.8633834141157913e-05, "loss": 0.1898929238319397, "step": 730 }, { "epoch": 0.13373362445414846, "grad_norm": 0.1418737769126892, "learning_rate": 4.860973200095592e-05, "loss": 0.17926375865936278, "step": 735 }, { "epoch": 0.1346433770014556, "grad_norm": 0.17151866853237152, "learning_rate": 4.858542518723794e-05, "loss": 0.18963592052459716, "step": 740 }, { "epoch": 0.13555312954876272, "grad_norm": 0.11162743717432022, "learning_rate": 4.8560913910720535e-05, "loss": 0.19466646909713745, "step": 745 }, { "epoch": 0.13646288209606988, "grad_norm": 0.15628376603126526, "learning_rate": 4.8536198383892725e-05, "loss": 0.19494034051895143, "step": 750 }, { "epoch": 0.137372634643377, "grad_norm": 0.18209289014339447, "learning_rate": 4.851127882101421e-05, "loss": 0.18747550249099731, "step": 755 }, { "epoch": 0.13828238719068414, "grad_norm": 0.14559614658355713, "learning_rate": 4.8486155438113454e-05, "loss": 0.1897158980369568, "step": 760 }, { "epoch": 0.13919213973799127, "grad_norm": 0.3198587894439697, "learning_rate": 4.846082845298586e-05, "loss": 0.18571001291275024, "step": 765 }, { "epoch": 0.1401018922852984, "grad_norm": 0.1486678421497345, "learning_rate": 4.843529808519189e-05, "loss": 0.19561930894851684, "step": 770 }, { "epoch": 0.14101164483260553, "grad_norm": 0.15318170189857483, "learning_rate": 4.840956455605509e-05, "loss": 0.187040114402771, "step": 775 }, { "epoch": 0.14192139737991266, "grad_norm": 0.13754244148731232, "learning_rate": 4.838362808866025e-05, "loss": 0.18345539569854735, "step": 780 }, { "epoch": 0.1428311499272198, "grad_norm": 0.12943248450756073, "learning_rate": 4.835748890785143e-05, "loss": 0.1921079397201538, "step": 785 }, { "epoch": 0.14374090247452692, "grad_norm": 0.110458143055439, "learning_rate": 4.833114724023001e-05, "loss": 0.17927205562591553, "step": 790 }, { "epoch": 0.14465065502183405, "grad_norm": 0.2421770840883255, "learning_rate": 4.830460331415275e-05, "loss": 0.18317567110061644, "step": 795 }, { "epoch": 0.14556040756914118, "grad_norm": 0.14752762019634247, "learning_rate": 4.8277857359729787e-05, "loss": 0.1843916058540344, "step": 800 }, { "epoch": 0.14647016011644834, "grad_norm": 0.15043556690216064, "learning_rate": 4.8250909608822644e-05, "loss": 0.18354393243789674, "step": 805 }, { "epoch": 0.14737991266375547, "grad_norm": 0.1381794661283493, "learning_rate": 4.822376029504223e-05, "loss": 0.1789781332015991, "step": 810 }, { "epoch": 0.1482896652110626, "grad_norm": 0.18386174738407135, "learning_rate": 4.819640965374681e-05, "loss": 0.19494292736053467, "step": 815 }, { "epoch": 0.14919941775836973, "grad_norm": 0.13829593360424042, "learning_rate": 4.816885792203996e-05, "loss": 0.18486063480377196, "step": 820 }, { "epoch": 0.15010917030567686, "grad_norm": 0.15033291280269623, "learning_rate": 4.814110533876852e-05, "loss": 0.18061509132385253, "step": 825 }, { "epoch": 0.151018922852984, "grad_norm": 0.17150473594665527, "learning_rate": 4.811315214452051e-05, "loss": 0.18464866876602173, "step": 830 }, { "epoch": 0.15192867540029112, "grad_norm": 0.15317125618457794, "learning_rate": 4.808499858162307e-05, "loss": 0.1837708592414856, "step": 835 }, { "epoch": 0.15283842794759825, "grad_norm": 0.2671392560005188, "learning_rate": 4.805664489414031e-05, "loss": 0.19338636398315429, "step": 840 }, { "epoch": 0.15374818049490538, "grad_norm": 0.14047028124332428, "learning_rate": 4.802809132787125e-05, "loss": 0.17069108486175538, "step": 845 }, { "epoch": 0.1546579330422125, "grad_norm": 0.1520431935787201, "learning_rate": 4.799933813034768e-05, "loss": 0.18607735633850098, "step": 850 }, { "epoch": 0.15556768558951964, "grad_norm": 0.17239463329315186, "learning_rate": 4.797038555083197e-05, "loss": 0.18069062232971192, "step": 855 }, { "epoch": 0.1564774381368268, "grad_norm": 0.1377955675125122, "learning_rate": 4.794123384031495e-05, "loss": 0.18870222568511963, "step": 860 }, { "epoch": 0.15738719068413393, "grad_norm": 0.15901461243629456, "learning_rate": 4.791188325151373e-05, "loss": 0.18128334283828734, "step": 865 }, { "epoch": 0.15829694323144106, "grad_norm": 0.14634132385253906, "learning_rate": 4.7882334038869495e-05, "loss": 0.1866163969039917, "step": 870 }, { "epoch": 0.1592066957787482, "grad_norm": 0.15361061692237854, "learning_rate": 4.785258645854529e-05, "loss": 0.17850807905197144, "step": 875 }, { "epoch": 0.16011644832605532, "grad_norm": 0.13751649856567383, "learning_rate": 4.782264076842385e-05, "loss": 0.17731113433837892, "step": 880 }, { "epoch": 0.16102620087336245, "grad_norm": 0.17909638583660126, "learning_rate": 4.7792497228105314e-05, "loss": 0.18344542980194092, "step": 885 }, { "epoch": 0.16193595342066958, "grad_norm": 0.16038304567337036, "learning_rate": 4.776215609890498e-05, "loss": 0.18868647813796996, "step": 890 }, { "epoch": 0.1628457059679767, "grad_norm": 0.1653951108455658, "learning_rate": 4.773161764385107e-05, "loss": 0.18614152669906617, "step": 895 }, { "epoch": 0.16375545851528384, "grad_norm": 0.16193026304244995, "learning_rate": 4.770088212768241e-05, "loss": 0.18564575910568237, "step": 900 }, { "epoch": 0.16466521106259097, "grad_norm": 0.16048531234264374, "learning_rate": 4.7669949816846173e-05, "loss": 0.18330031633377075, "step": 905 }, { "epoch": 0.1655749636098981, "grad_norm": 0.1440177708864212, "learning_rate": 4.7638820979495534e-05, "loss": 0.17712442874908446, "step": 910 }, { "epoch": 0.16648471615720525, "grad_norm": 0.19635969400405884, "learning_rate": 4.760749588548738e-05, "loss": 0.18679027557373046, "step": 915 }, { "epoch": 0.16739446870451238, "grad_norm": 0.15576541423797607, "learning_rate": 4.757597480637995e-05, "loss": 0.19283764362335204, "step": 920 }, { "epoch": 0.1683042212518195, "grad_norm": 0.1550331562757492, "learning_rate": 4.7544258015430463e-05, "loss": 0.18269542455673218, "step": 925 }, { "epoch": 0.16921397379912664, "grad_norm": 0.18369626998901367, "learning_rate": 4.75123457875928e-05, "loss": 0.1697891116142273, "step": 930 }, { "epoch": 0.17012372634643377, "grad_norm": 0.15266314148902893, "learning_rate": 4.7480238399515074e-05, "loss": 0.18523451089859008, "step": 935 }, { "epoch": 0.1710334788937409, "grad_norm": 0.16709664463996887, "learning_rate": 4.744793612953724e-05, "loss": 0.1803238034248352, "step": 940 }, { "epoch": 0.17194323144104803, "grad_norm": 0.14929179847240448, "learning_rate": 4.741543925768872e-05, "loss": 0.1861217737197876, "step": 945 }, { "epoch": 0.17285298398835516, "grad_norm": 0.1362280696630478, "learning_rate": 4.7382748065685915e-05, "loss": 0.17896100282669067, "step": 950 }, { "epoch": 0.1737627365356623, "grad_norm": 0.15290239453315735, "learning_rate": 4.734986283692982e-05, "loss": 0.18432788848876952, "step": 955 }, { "epoch": 0.17467248908296942, "grad_norm": 0.1287035197019577, "learning_rate": 4.73167838565035e-05, "loss": 0.18485682010650634, "step": 960 }, { "epoch": 0.17558224163027655, "grad_norm": 0.17969627678394318, "learning_rate": 4.728351141116971e-05, "loss": 0.17361557483673096, "step": 965 }, { "epoch": 0.1764919941775837, "grad_norm": 0.13751201331615448, "learning_rate": 4.7250045789368326e-05, "loss": 0.1731679320335388, "step": 970 }, { "epoch": 0.17740174672489084, "grad_norm": 0.1603265255689621, "learning_rate": 4.721638728121388e-05, "loss": 0.17308170795440675, "step": 975 }, { "epoch": 0.17831149927219797, "grad_norm": 0.1592789888381958, "learning_rate": 4.718253617849306e-05, "loss": 0.17534757852554322, "step": 980 }, { "epoch": 0.1792212518195051, "grad_norm": 0.12727224826812744, "learning_rate": 4.714849277466214e-05, "loss": 0.17817609310150145, "step": 985 }, { "epoch": 0.18013100436681223, "grad_norm": 0.15401554107666016, "learning_rate": 4.711425736484447e-05, "loss": 0.1733405351638794, "step": 990 }, { "epoch": 0.18104075691411936, "grad_norm": 0.13253968954086304, "learning_rate": 4.7079830245827906e-05, "loss": 0.17846795320510864, "step": 995 }, { "epoch": 0.1819505094614265, "grad_norm": 0.21846213936805725, "learning_rate": 4.7045211716062245e-05, "loss": 0.18021599054336548, "step": 1000 }, { "epoch": 0.18286026200873362, "grad_norm": 0.16867990791797638, "learning_rate": 4.7010402075656595e-05, "loss": 0.18232386112213134, "step": 1005 }, { "epoch": 0.18377001455604075, "grad_norm": 0.17180582880973816, "learning_rate": 4.697540162637686e-05, "loss": 0.1816317319869995, "step": 1010 }, { "epoch": 0.18467976710334788, "grad_norm": 0.16480213403701782, "learning_rate": 4.694021067164303e-05, "loss": 0.17718446254730225, "step": 1015 }, { "epoch": 0.185589519650655, "grad_norm": 0.15015918016433716, "learning_rate": 4.6904829516526605e-05, "loss": 0.17412011623382567, "step": 1020 }, { "epoch": 0.18649927219796217, "grad_norm": 0.14445139467716217, "learning_rate": 4.686925846774795e-05, "loss": 0.1778018832206726, "step": 1025 }, { "epoch": 0.1874090247452693, "grad_norm": 0.1701960265636444, "learning_rate": 4.683349783367362e-05, "loss": 0.16901081800460815, "step": 1030 }, { "epoch": 0.18831877729257643, "grad_norm": 0.15894867479801178, "learning_rate": 4.679754792431368e-05, "loss": 0.17055928707122803, "step": 1035 }, { "epoch": 0.18922852983988356, "grad_norm": 0.1511942446231842, "learning_rate": 4.676140905131903e-05, "loss": 0.17339680194854737, "step": 1040 }, { "epoch": 0.1901382823871907, "grad_norm": 0.14735209941864014, "learning_rate": 4.672508152797872e-05, "loss": 0.17802717685699462, "step": 1045 }, { "epoch": 0.19104803493449782, "grad_norm": 0.17367291450500488, "learning_rate": 4.66885656692172e-05, "loss": 0.1732744097709656, "step": 1050 }, { "epoch": 0.19195778748180495, "grad_norm": 0.147227481007576, "learning_rate": 4.665186179159159e-05, "loss": 0.17040517330169677, "step": 1055 }, { "epoch": 0.19286754002911208, "grad_norm": 0.1709655076265335, "learning_rate": 4.6614970213289e-05, "loss": 0.17794088125228882, "step": 1060 }, { "epoch": 0.1937772925764192, "grad_norm": 0.1588088721036911, "learning_rate": 4.657789125412366e-05, "loss": 0.17180380821228028, "step": 1065 }, { "epoch": 0.19468704512372634, "grad_norm": 0.14827021956443787, "learning_rate": 4.654062523553428e-05, "loss": 0.182997989654541, "step": 1070 }, { "epoch": 0.19559679767103347, "grad_norm": 0.16230466961860657, "learning_rate": 4.6503172480581126e-05, "loss": 0.17346880435943604, "step": 1075 }, { "epoch": 0.1965065502183406, "grad_norm": 0.1637624353170395, "learning_rate": 4.646553331394333e-05, "loss": 0.17263576984405518, "step": 1080 }, { "epoch": 0.19741630276564776, "grad_norm": 0.15977843105793, "learning_rate": 4.642770806191603e-05, "loss": 0.17284308671951293, "step": 1085 }, { "epoch": 0.19832605531295489, "grad_norm": 0.15394869446754456, "learning_rate": 4.6389697052407534e-05, "loss": 0.17797101736068727, "step": 1090 }, { "epoch": 0.19923580786026202, "grad_norm": 0.15995225310325623, "learning_rate": 4.6351500614936485e-05, "loss": 0.18137198686599731, "step": 1095 }, { "epoch": 0.20014556040756915, "grad_norm": 0.1779479682445526, "learning_rate": 4.6313119080629006e-05, "loss": 0.17998344898223878, "step": 1100 }, { "epoch": 0.20105531295487628, "grad_norm": 0.14362832903862, "learning_rate": 4.627455278221584e-05, "loss": 0.18196423053741456, "step": 1105 }, { "epoch": 0.2019650655021834, "grad_norm": 0.15951639413833618, "learning_rate": 4.623580205402947e-05, "loss": 0.17423888444900512, "step": 1110 }, { "epoch": 0.20287481804949054, "grad_norm": 0.17273563146591187, "learning_rate": 4.619686723200115e-05, "loss": 0.17392473220825194, "step": 1115 }, { "epoch": 0.20378457059679767, "grad_norm": 0.1655360758304596, "learning_rate": 4.615774865365813e-05, "loss": 0.17528389692306517, "step": 1120 }, { "epoch": 0.2046943231441048, "grad_norm": 0.15920691192150116, "learning_rate": 4.611844665812058e-05, "loss": 0.1806849241256714, "step": 1125 }, { "epoch": 0.20560407569141192, "grad_norm": 0.16114577651023865, "learning_rate": 4.607896158609875e-05, "loss": 0.17217352390289306, "step": 1130 }, { "epoch": 0.20651382823871905, "grad_norm": 0.1499422937631607, "learning_rate": 4.603929377988999e-05, "loss": 0.17806737422943114, "step": 1135 }, { "epoch": 0.2074235807860262, "grad_norm": 0.17605191469192505, "learning_rate": 4.5999443583375765e-05, "loss": 0.17842113971710205, "step": 1140 }, { "epoch": 0.20833333333333334, "grad_norm": 0.16117210686206818, "learning_rate": 4.595941134201871e-05, "loss": 0.18379683494567872, "step": 1145 }, { "epoch": 0.20924308588064047, "grad_norm": 0.21199050545692444, "learning_rate": 4.591919740285957e-05, "loss": 0.16286123991012574, "step": 1150 }, { "epoch": 0.2101528384279476, "grad_norm": 0.15100529789924622, "learning_rate": 4.587880211451427e-05, "loss": 0.17995200157165528, "step": 1155 }, { "epoch": 0.21106259097525473, "grad_norm": 0.16618172824382782, "learning_rate": 4.583822582717085e-05, "loss": 0.16960303783416747, "step": 1160 }, { "epoch": 0.21197234352256186, "grad_norm": 0.14743569493293762, "learning_rate": 4.579746889258643e-05, "loss": 0.17762668132781984, "step": 1165 }, { "epoch": 0.212882096069869, "grad_norm": 0.1697179079055786, "learning_rate": 4.575653166408417e-05, "loss": 0.16656005382537842, "step": 1170 }, { "epoch": 0.21379184861717612, "grad_norm": 0.14886513352394104, "learning_rate": 4.57154144965502e-05, "loss": 0.17091882228851318, "step": 1175 }, { "epoch": 0.21470160116448325, "grad_norm": 0.18197473883628845, "learning_rate": 4.5674117746430556e-05, "loss": 0.1770920753479004, "step": 1180 }, { "epoch": 0.21561135371179038, "grad_norm": 0.17323088645935059, "learning_rate": 4.563264177172807e-05, "loss": 0.1734643578529358, "step": 1185 }, { "epoch": 0.2165211062590975, "grad_norm": 0.1521984338760376, "learning_rate": 4.559098693199929e-05, "loss": 0.17515116930007935, "step": 1190 }, { "epoch": 0.21743085880640467, "grad_norm": 0.1842304915189743, "learning_rate": 4.554915358835134e-05, "loss": 0.16798022985458375, "step": 1195 }, { "epoch": 0.2183406113537118, "grad_norm": 0.14753451943397522, "learning_rate": 4.5507142103438794e-05, "loss": 0.1755476713180542, "step": 1200 }, { "epoch": 0.21925036390101893, "grad_norm": 0.17096194624900818, "learning_rate": 4.546495284146057e-05, "loss": 0.1792473554611206, "step": 1205 }, { "epoch": 0.22016011644832606, "grad_norm": 0.1579233556985855, "learning_rate": 4.542258616815669e-05, "loss": 0.17230144739151002, "step": 1210 }, { "epoch": 0.2210698689956332, "grad_norm": 0.177297905087471, "learning_rate": 4.5380042450805216e-05, "loss": 0.1807127833366394, "step": 1215 }, { "epoch": 0.22197962154294032, "grad_norm": 0.14331696927547455, "learning_rate": 4.533732205821897e-05, "loss": 0.17201389074325563, "step": 1220 }, { "epoch": 0.22288937409024745, "grad_norm": 0.14473360776901245, "learning_rate": 4.529442536074239e-05, "loss": 0.17036900520324708, "step": 1225 }, { "epoch": 0.22379912663755458, "grad_norm": 0.1820901483297348, "learning_rate": 4.5251352730248314e-05, "loss": 0.17704882621765136, "step": 1230 }, { "epoch": 0.2247088791848617, "grad_norm": 0.1948976367712021, "learning_rate": 4.5208104540134746e-05, "loss": 0.1706973433494568, "step": 1235 }, { "epoch": 0.22561863173216884, "grad_norm": 0.16660070419311523, "learning_rate": 4.51646811653216e-05, "loss": 0.17636821269989014, "step": 1240 }, { "epoch": 0.22652838427947597, "grad_norm": 0.1699984073638916, "learning_rate": 4.512108298224751e-05, "loss": 0.16986632347106934, "step": 1245 }, { "epoch": 0.22743813682678313, "grad_norm": 0.17601042985916138, "learning_rate": 4.50773103688665e-05, "loss": 0.17507898807525635, "step": 1250 }, { "epoch": 0.22834788937409026, "grad_norm": 0.17557238042354584, "learning_rate": 4.503336370464476e-05, "loss": 0.17702863216400147, "step": 1255 }, { "epoch": 0.2292576419213974, "grad_norm": 0.1800651252269745, "learning_rate": 4.498924337055729e-05, "loss": 0.16419180631637573, "step": 1260 }, { "epoch": 0.23016739446870452, "grad_norm": 0.2022479772567749, "learning_rate": 4.494494974908468e-05, "loss": 0.17482060194015503, "step": 1265 }, { "epoch": 0.23107714701601165, "grad_norm": 0.14180205762386322, "learning_rate": 4.490048322420973e-05, "loss": 0.1723136067390442, "step": 1270 }, { "epoch": 0.23198689956331878, "grad_norm": 0.18607310950756073, "learning_rate": 4.485584418141419e-05, "loss": 0.17096419334411622, "step": 1275 }, { "epoch": 0.2328966521106259, "grad_norm": 0.15958310663700104, "learning_rate": 4.481103300767529e-05, "loss": 0.1656244158744812, "step": 1280 }, { "epoch": 0.23380640465793304, "grad_norm": 0.17552383244037628, "learning_rate": 4.476605009146255e-05, "loss": 0.17677626609802247, "step": 1285 }, { "epoch": 0.23471615720524017, "grad_norm": 0.15299823880195618, "learning_rate": 4.472089582273429e-05, "loss": 0.1778991103172302, "step": 1290 }, { "epoch": 0.2356259097525473, "grad_norm": 0.14613987505435944, "learning_rate": 4.46755705929343e-05, "loss": 0.17071452140808105, "step": 1295 }, { "epoch": 0.23653566229985443, "grad_norm": 0.17781122028827667, "learning_rate": 4.463007479498843e-05, "loss": 0.16955430507659913, "step": 1300 }, { "epoch": 0.23744541484716158, "grad_norm": 0.16326487064361572, "learning_rate": 4.458440882330119e-05, "loss": 0.1777693510055542, "step": 1305 }, { "epoch": 0.23835516739446871, "grad_norm": 0.17701926827430725, "learning_rate": 4.4538573073752365e-05, "loss": 0.16323351860046387, "step": 1310 }, { "epoch": 0.23926491994177584, "grad_norm": 0.13104717433452606, "learning_rate": 4.449256794369349e-05, "loss": 0.17653456926345826, "step": 1315 }, { "epoch": 0.24017467248908297, "grad_norm": 0.1796836256980896, "learning_rate": 4.444639383194452e-05, "loss": 0.17189600467681884, "step": 1320 }, { "epoch": 0.2410844250363901, "grad_norm": 0.14919696748256683, "learning_rate": 4.440005113879029e-05, "loss": 0.17003334760665895, "step": 1325 }, { "epoch": 0.24199417758369723, "grad_norm": 0.1728784441947937, "learning_rate": 4.4353540265977064e-05, "loss": 0.17397408485412597, "step": 1330 }, { "epoch": 0.24290393013100436, "grad_norm": 0.14591015875339508, "learning_rate": 4.43068616167091e-05, "loss": 0.16498478651046752, "step": 1335 }, { "epoch": 0.2438136826783115, "grad_norm": 0.18417201936244965, "learning_rate": 4.4260015595645055e-05, "loss": 0.16841750144958495, "step": 1340 }, { "epoch": 0.24472343522561862, "grad_norm": 0.16264279186725616, "learning_rate": 4.4213002608894605e-05, "loss": 0.16907373666763306, "step": 1345 }, { "epoch": 0.24563318777292575, "grad_norm": 0.15248481929302216, "learning_rate": 4.416582306401481e-05, "loss": 0.15931472778320313, "step": 1350 }, { "epoch": 0.24654294032023288, "grad_norm": 0.1488373875617981, "learning_rate": 4.4118477370006636e-05, "loss": 0.1701716423034668, "step": 1355 }, { "epoch": 0.24745269286754004, "grad_norm": 0.14679782092571259, "learning_rate": 4.407096593731142e-05, "loss": 0.157412326335907, "step": 1360 }, { "epoch": 0.24836244541484717, "grad_norm": 0.17139530181884766, "learning_rate": 4.402328917780728e-05, "loss": 0.17303754091262818, "step": 1365 }, { "epoch": 0.2492721979621543, "grad_norm": 0.1534871757030487, "learning_rate": 4.397544750480554e-05, "loss": 0.1786255121231079, "step": 1370 }, { "epoch": 0.2501819505094614, "grad_norm": 0.1876252293586731, "learning_rate": 4.39274413330472e-05, "loss": 0.16442898511886597, "step": 1375 }, { "epoch": 0.25109170305676853, "grad_norm": 0.16165752708911896, "learning_rate": 4.387927107869928e-05, "loss": 0.1780426025390625, "step": 1380 }, { "epoch": 0.25200145560407566, "grad_norm": 0.17242255806922913, "learning_rate": 4.383093715935124e-05, "loss": 0.15959256887435913, "step": 1385 }, { "epoch": 0.25291120815138285, "grad_norm": 0.1627114862203598, "learning_rate": 4.378243999401137e-05, "loss": 0.17606115341186523, "step": 1390 }, { "epoch": 0.25382096069869, "grad_norm": 0.15911224484443665, "learning_rate": 4.373378000310312e-05, "loss": 0.16798585653305054, "step": 1395 }, { "epoch": 0.2547307132459971, "grad_norm": 0.15542249381542206, "learning_rate": 4.3684957608461505e-05, "loss": 0.1695417881011963, "step": 1400 }, { "epoch": 0.25564046579330424, "grad_norm": 0.1475304812192917, "learning_rate": 4.363597323332941e-05, "loss": 0.16340878009796142, "step": 1405 }, { "epoch": 0.25655021834061137, "grad_norm": 0.16943927109241486, "learning_rate": 4.358682730235395e-05, "loss": 0.17240238189697266, "step": 1410 }, { "epoch": 0.2574599708879185, "grad_norm": 0.1816391944885254, "learning_rate": 4.3537520241582744e-05, "loss": 0.16558437347412108, "step": 1415 }, { "epoch": 0.25836972343522563, "grad_norm": 0.23851341009140015, "learning_rate": 4.348805247846027e-05, "loss": 0.16796000003814698, "step": 1420 }, { "epoch": 0.25927947598253276, "grad_norm": 0.15415243804454803, "learning_rate": 4.343842444182414e-05, "loss": 0.1746017098426819, "step": 1425 }, { "epoch": 0.2601892285298399, "grad_norm": 0.15651032328605652, "learning_rate": 4.338863656190139e-05, "loss": 0.1649057984352112, "step": 1430 }, { "epoch": 0.261098981077147, "grad_norm": 0.16601966321468353, "learning_rate": 4.333868927030471e-05, "loss": 0.15888988971710205, "step": 1435 }, { "epoch": 0.26200873362445415, "grad_norm": 0.1549467295408249, "learning_rate": 4.328858300002876e-05, "loss": 0.16357985734939576, "step": 1440 }, { "epoch": 0.2629184861717613, "grad_norm": 0.16332370042800903, "learning_rate": 4.32383181854464e-05, "loss": 0.16749982833862304, "step": 1445 }, { "epoch": 0.2638282387190684, "grad_norm": 0.14827077090740204, "learning_rate": 4.3187895262304894e-05, "loss": 0.16886214017868043, "step": 1450 }, { "epoch": 0.26473799126637554, "grad_norm": 0.1557198166847229, "learning_rate": 4.313731466772216e-05, "loss": 0.17512214183807373, "step": 1455 }, { "epoch": 0.26564774381368267, "grad_norm": 0.17263570427894592, "learning_rate": 4.308657684018299e-05, "loss": 0.16248074769973755, "step": 1460 }, { "epoch": 0.2665574963609898, "grad_norm": 0.17135761678218842, "learning_rate": 4.303568221953521e-05, "loss": 0.16605921983718872, "step": 1465 }, { "epoch": 0.26746724890829693, "grad_norm": 0.14322632551193237, "learning_rate": 4.2984631246985897e-05, "loss": 0.1610772728919983, "step": 1470 }, { "epoch": 0.26837700145560406, "grad_norm": 0.18852312862873077, "learning_rate": 4.2933424365097564e-05, "loss": 0.1686462163925171, "step": 1475 }, { "epoch": 0.2692867540029112, "grad_norm": 0.1780245155096054, "learning_rate": 4.2882062017784294e-05, "loss": 0.16953932046890258, "step": 1480 }, { "epoch": 0.2701965065502183, "grad_norm": 0.180568665266037, "learning_rate": 4.2830544650307895e-05, "loss": 0.16442664861679077, "step": 1485 }, { "epoch": 0.27110625909752545, "grad_norm": 0.16876435279846191, "learning_rate": 4.277887270927407e-05, "loss": 0.17128173112869263, "step": 1490 }, { "epoch": 0.2720160116448326, "grad_norm": 0.164053276181221, "learning_rate": 4.2727046642628513e-05, "loss": 0.16331382989883422, "step": 1495 }, { "epoch": 0.27292576419213976, "grad_norm": 0.14577528834342957, "learning_rate": 4.267506689965305e-05, "loss": 0.1638316035270691, "step": 1500 }, { "epoch": 0.2738355167394469, "grad_norm": 0.1648740917444229, "learning_rate": 4.262293393096171e-05, "loss": 0.15332664251327516, "step": 1505 }, { "epoch": 0.274745269286754, "grad_norm": 0.16445094347000122, "learning_rate": 4.257064818849685e-05, "loss": 0.1706634521484375, "step": 1510 }, { "epoch": 0.27565502183406115, "grad_norm": 0.1584935486316681, "learning_rate": 4.251821012552524e-05, "loss": 0.1684114694595337, "step": 1515 }, { "epoch": 0.2765647743813683, "grad_norm": 0.17215611040592194, "learning_rate": 4.24656201966341e-05, "loss": 0.15594131946563722, "step": 1520 }, { "epoch": 0.2774745269286754, "grad_norm": 0.15945589542388916, "learning_rate": 4.2412878857727214e-05, "loss": 0.1686659574508667, "step": 1525 }, { "epoch": 0.27838427947598254, "grad_norm": 0.16103951632976532, "learning_rate": 4.2359986566020906e-05, "loss": 0.17779340744018554, "step": 1530 }, { "epoch": 0.2792940320232897, "grad_norm": 0.1770307570695877, "learning_rate": 4.230694378004014e-05, "loss": 0.16786882877349854, "step": 1535 }, { "epoch": 0.2802037845705968, "grad_norm": 0.16225053369998932, "learning_rate": 4.2253750959614504e-05, "loss": 0.16558897495269775, "step": 1540 }, { "epoch": 0.28111353711790393, "grad_norm": 0.27213969826698303, "learning_rate": 4.220040856587425e-05, "loss": 0.1641119599342346, "step": 1545 }, { "epoch": 0.28202328966521106, "grad_norm": 0.1773071587085724, "learning_rate": 4.2146917061246284e-05, "loss": 0.16919140815734862, "step": 1550 }, { "epoch": 0.2829330422125182, "grad_norm": 0.15519705414772034, "learning_rate": 4.209327690945014e-05, "loss": 0.15501506328582765, "step": 1555 }, { "epoch": 0.2838427947598253, "grad_norm": 0.19921597838401794, "learning_rate": 4.203948857549402e-05, "loss": 0.1690821886062622, "step": 1560 }, { "epoch": 0.28475254730713245, "grad_norm": 0.15417630970478058, "learning_rate": 4.1985552525670696e-05, "loss": 0.1675640344619751, "step": 1565 }, { "epoch": 0.2856622998544396, "grad_norm": 0.1739572137594223, "learning_rate": 4.193146922755348e-05, "loss": 0.16738017797470092, "step": 1570 }, { "epoch": 0.2865720524017467, "grad_norm": 0.1384361982345581, "learning_rate": 4.187723914999221e-05, "loss": 0.16802358627319336, "step": 1575 }, { "epoch": 0.28748180494905384, "grad_norm": 0.1491454839706421, "learning_rate": 4.182286276310915e-05, "loss": 0.1619583249092102, "step": 1580 }, { "epoch": 0.288391557496361, "grad_norm": 0.15831919014453888, "learning_rate": 4.176834053829492e-05, "loss": 0.1625199794769287, "step": 1585 }, { "epoch": 0.2893013100436681, "grad_norm": 0.16265396773815155, "learning_rate": 4.1713672948204416e-05, "loss": 0.16718552112579346, "step": 1590 }, { "epoch": 0.29021106259097523, "grad_norm": 0.15153461694717407, "learning_rate": 4.1658860466752714e-05, "loss": 0.15979087352752686, "step": 1595 }, { "epoch": 0.29112081513828236, "grad_norm": 0.1620412915945053, "learning_rate": 4.160390356911096e-05, "loss": 0.16103557348251343, "step": 1600 } ], "logging_steps": 5, "max_steps": 5500, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.860485929468659e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }