RohithMidigudla's picture
Upload folder using huggingface_hub
e28bc94 verified
Raw
History Blame Contribute Delete
76.1 kB
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.363901018922853,
"eval_steps": 100,
"global_step": 2000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0009097525473071324,
"grad_norm": 1.0602493286132812,
"learning_rate": 1.2121212121212122e-06,
"loss": 1.7156932830810547,
"step": 5
},
{
"epoch": 0.001819505094614265,
"grad_norm": 1.1577719449996948,
"learning_rate": 2.7272727272727272e-06,
"loss": 1.6629371643066406,
"step": 10
},
{
"epoch": 0.0027292576419213972,
"grad_norm": 1.0288419723510742,
"learning_rate": 4.242424242424243e-06,
"loss": 1.6706295013427734,
"step": 15
},
{
"epoch": 0.00363901018922853,
"grad_norm": 2.129403829574585,
"learning_rate": 5.7575757575757586e-06,
"loss": 1.7363752365112304,
"step": 20
},
{
"epoch": 0.004548762736535662,
"grad_norm": 1.9468326568603516,
"learning_rate": 7.272727272727272e-06,
"loss": 1.7111135482788087,
"step": 25
},
{
"epoch": 0.0054585152838427945,
"grad_norm": 1.1269357204437256,
"learning_rate": 8.787878787878788e-06,
"loss": 1.6924203872680663,
"step": 30
},
{
"epoch": 0.006368267831149927,
"grad_norm": 1.4021248817443848,
"learning_rate": 1.0303030303030304e-05,
"loss": 1.658310317993164,
"step": 35
},
{
"epoch": 0.00727802037845706,
"grad_norm": 1.313381314277649,
"learning_rate": 1.1818181818181819e-05,
"loss": 1.5383296012878418,
"step": 40
},
{
"epoch": 0.008187772925764192,
"grad_norm": 2.4359891414642334,
"learning_rate": 1.3333333333333333e-05,
"loss": 1.4302565574645996,
"step": 45
},
{
"epoch": 0.009097525473071324,
"grad_norm": 1.6459542512893677,
"learning_rate": 1.484848484848485e-05,
"loss": 1.2602953910827637,
"step": 50
},
{
"epoch": 0.010007278020378457,
"grad_norm": 0.7953159213066101,
"learning_rate": 1.6363636363636366e-05,
"loss": 1.204326343536377,
"step": 55
},
{
"epoch": 0.010917030567685589,
"grad_norm": 0.5824465155601501,
"learning_rate": 1.787878787878788e-05,
"loss": 1.068561840057373,
"step": 60
},
{
"epoch": 0.011826783114992722,
"grad_norm": 0.39265626668930054,
"learning_rate": 1.9393939393939395e-05,
"loss": 0.9570062637329102,
"step": 65
},
{
"epoch": 0.012736535662299854,
"grad_norm": 0.3387283384799957,
"learning_rate": 2.090909090909091e-05,
"loss": 0.9454713821411133,
"step": 70
},
{
"epoch": 0.013646288209606987,
"grad_norm": 0.3182811141014099,
"learning_rate": 2.2424242424242424e-05,
"loss": 0.8901592254638672,
"step": 75
},
{
"epoch": 0.01455604075691412,
"grad_norm": 0.2735312879085541,
"learning_rate": 2.393939393939394e-05,
"loss": 0.8491583824157715,
"step": 80
},
{
"epoch": 0.015465793304221253,
"grad_norm": 0.2376435250043869,
"learning_rate": 2.5454545454545454e-05,
"loss": 0.8109179496765136,
"step": 85
},
{
"epoch": 0.016375545851528384,
"grad_norm": 0.2161586880683899,
"learning_rate": 2.696969696969697e-05,
"loss": 0.76962308883667,
"step": 90
},
{
"epoch": 0.017285298398835518,
"grad_norm": 0.19587980210781097,
"learning_rate": 2.8484848484848486e-05,
"loss": 0.7301986694335938,
"step": 95
},
{
"epoch": 0.018195050946142648,
"grad_norm": 0.20971694588661194,
"learning_rate": 3e-05,
"loss": 0.7269618034362793,
"step": 100
},
{
"epoch": 0.018195050946142648,
"eval_loss": 2.605874538421631,
"eval_runtime": 1120.0905,
"eval_samples_per_second": 33.935,
"eval_steps_per_second": 8.484,
"step": 100
},
{
"epoch": 0.01910480349344978,
"grad_norm": 0.10413152724504471,
"learning_rate": 3.151515151515151e-05,
"loss": 0.3250573635101318,
"step": 105
},
{
"epoch": 0.020014556040756915,
"grad_norm": 0.09383206814527512,
"learning_rate": 3.303030303030303e-05,
"loss": 0.3277724742889404,
"step": 110
},
{
"epoch": 0.020924308588064048,
"grad_norm": 0.1195850670337677,
"learning_rate": 3.454545454545455e-05,
"loss": 0.3215961217880249,
"step": 115
},
{
"epoch": 0.021834061135371178,
"grad_norm": 0.0715397521853447,
"learning_rate": 3.606060606060606e-05,
"loss": 0.3120795965194702,
"step": 120
},
{
"epoch": 0.02274381368267831,
"grad_norm": 0.068007692694664,
"learning_rate": 3.757575757575758e-05,
"loss": 0.2964257955551147,
"step": 125
},
{
"epoch": 0.023653566229985445,
"grad_norm": 0.09345484524965286,
"learning_rate": 3.909090909090909e-05,
"loss": 0.30776252746582033,
"step": 130
},
{
"epoch": 0.024563318777292575,
"grad_norm": 0.05577846243977547,
"learning_rate": 4.0606060606060606e-05,
"loss": 0.3180255889892578,
"step": 135
},
{
"epoch": 0.025473071324599708,
"grad_norm": 0.05919989198446274,
"learning_rate": 4.212121212121212e-05,
"loss": 0.31608285903930666,
"step": 140
},
{
"epoch": 0.02638282387190684,
"grad_norm": 0.05644674599170685,
"learning_rate": 4.3636363636363636e-05,
"loss": 0.2993780136108398,
"step": 145
},
{
"epoch": 0.027292576419213975,
"grad_norm": 0.059986088424921036,
"learning_rate": 4.515151515151516e-05,
"loss": 0.2931638479232788,
"step": 150
},
{
"epoch": 0.028202328966521105,
"grad_norm": 0.05941484495997429,
"learning_rate": 4.666666666666667e-05,
"loss": 0.29284651279449464,
"step": 155
},
{
"epoch": 0.02911208151382824,
"grad_norm": 0.0579044483602047,
"learning_rate": 4.8181818181818186e-05,
"loss": 0.2927037000656128,
"step": 160
},
{
"epoch": 0.030021834061135372,
"grad_norm": 0.061985693871974945,
"learning_rate": 4.9696969696969694e-05,
"loss": 0.28671720027923586,
"step": 165
},
{
"epoch": 0.030931586608442505,
"grad_norm": 0.05715535953640938,
"learning_rate": 4.999993064772809e-05,
"loss": 0.2817929744720459,
"step": 170
},
{
"epoch": 0.03184133915574964,
"grad_norm": 0.06549780815839767,
"learning_rate": 4.999964890478288e-05,
"loss": 0.27853829860687257,
"step": 175
},
{
"epoch": 0.03275109170305677,
"grad_norm": 0.05948757752776146,
"learning_rate": 4.999915043908795e-05,
"loss": 0.27522289752960205,
"step": 180
},
{
"epoch": 0.0336608442503639,
"grad_norm": 0.06262889504432678,
"learning_rate": 4.9998435254964515e-05,
"loss": 0.270997428894043,
"step": 185
},
{
"epoch": 0.034570596797671035,
"grad_norm": 0.06916829943656921,
"learning_rate": 4.999750335861253e-05,
"loss": 0.2788438558578491,
"step": 190
},
{
"epoch": 0.035480349344978165,
"grad_norm": 0.06128217652440071,
"learning_rate": 4.9996354758110624e-05,
"loss": 0.25649352073669435,
"step": 195
},
{
"epoch": 0.036390101892285295,
"grad_norm": 0.06704027950763702,
"learning_rate": 4.999498946341606e-05,
"loss": 0.25619523525238036,
"step": 200
},
{
"epoch": 0.03729985443959243,
"grad_norm": 0.061678580939769745,
"learning_rate": 4.999340748636462e-05,
"loss": 0.24956226348876953,
"step": 205
},
{
"epoch": 0.03820960698689956,
"grad_norm": 0.07328873127698898,
"learning_rate": 4.999160884067051e-05,
"loss": 0.26169676780700685,
"step": 210
},
{
"epoch": 0.0391193595342067,
"grad_norm": 0.08287990838289261,
"learning_rate": 4.9989593541926246e-05,
"loss": 0.2574604034423828,
"step": 215
},
{
"epoch": 0.04002911208151383,
"grad_norm": 0.06787359714508057,
"learning_rate": 4.9987361607602525e-05,
"loss": 0.25351409912109374,
"step": 220
},
{
"epoch": 0.04093886462882096,
"grad_norm": 0.06695502996444702,
"learning_rate": 4.998491305704805e-05,
"loss": 0.24522039890289307,
"step": 225
},
{
"epoch": 0.041848617176128096,
"grad_norm": 0.08872214704751968,
"learning_rate": 4.9982247911489375e-05,
"loss": 0.2581867933273315,
"step": 230
},
{
"epoch": 0.042758369723435226,
"grad_norm": 0.07637131959199905,
"learning_rate": 4.9979366194030743e-05,
"loss": 0.25569658279418944,
"step": 235
},
{
"epoch": 0.043668122270742356,
"grad_norm": 0.08158119022846222,
"learning_rate": 4.997626792965385e-05,
"loss": 0.2529409646987915,
"step": 240
},
{
"epoch": 0.04457787481804949,
"grad_norm": 0.07529161125421524,
"learning_rate": 4.997295314521766e-05,
"loss": 0.24049024581909179,
"step": 245
},
{
"epoch": 0.04548762736535662,
"grad_norm": 0.08860139548778534,
"learning_rate": 4.996942186945813e-05,
"loss": 0.2490522861480713,
"step": 250
},
{
"epoch": 0.04639737991266375,
"grad_norm": 0.0850321501493454,
"learning_rate": 4.9965674132988005e-05,
"loss": 0.24180831909179687,
"step": 255
},
{
"epoch": 0.04730713245997089,
"grad_norm": 0.07556115090847015,
"learning_rate": 4.996170996829653e-05,
"loss": 0.2509631872177124,
"step": 260
},
{
"epoch": 0.04821688500727802,
"grad_norm": 0.07971206307411194,
"learning_rate": 4.995752940974918e-05,
"loss": 0.24398891925811766,
"step": 265
},
{
"epoch": 0.04912663755458515,
"grad_norm": 0.09149336814880371,
"learning_rate": 4.9953132493587344e-05,
"loss": 0.2300492286682129,
"step": 270
},
{
"epoch": 0.050036390101892286,
"grad_norm": 0.08265820890665054,
"learning_rate": 4.9948519257928034e-05,
"loss": 0.24246792793273925,
"step": 275
},
{
"epoch": 0.050946142649199416,
"grad_norm": 0.10328587144613266,
"learning_rate": 4.9943689742763534e-05,
"loss": 0.2367171049118042,
"step": 280
},
{
"epoch": 0.05185589519650655,
"grad_norm": 0.0836917981505394,
"learning_rate": 4.993864398996105e-05,
"loss": 0.23215813636779786,
"step": 285
},
{
"epoch": 0.05276564774381368,
"grad_norm": 0.09475161135196686,
"learning_rate": 4.99333820432624e-05,
"loss": 0.2350748062133789,
"step": 290
},
{
"epoch": 0.05367540029112081,
"grad_norm": 0.08040128648281097,
"learning_rate": 4.992790394828355e-05,
"loss": 0.23253886699676513,
"step": 295
},
{
"epoch": 0.05458515283842795,
"grad_norm": 0.08852150291204453,
"learning_rate": 4.992220975251428e-05,
"loss": 0.23856515884399415,
"step": 300
},
{
"epoch": 0.05549490538573508,
"grad_norm": 0.09565229713916779,
"learning_rate": 4.991629950531775e-05,
"loss": 0.23311660289764405,
"step": 305
},
{
"epoch": 0.05640465793304221,
"grad_norm": 0.08158160001039505,
"learning_rate": 4.991017325793009e-05,
"loss": 0.22467944622039795,
"step": 310
},
{
"epoch": 0.05731441048034935,
"grad_norm": 0.07746429741382599,
"learning_rate": 4.990383106345994e-05,
"loss": 0.229844069480896,
"step": 315
},
{
"epoch": 0.05822416302765648,
"grad_norm": 0.08564355969429016,
"learning_rate": 4.989727297688797e-05,
"loss": 0.22414517402648926,
"step": 320
},
{
"epoch": 0.05913391557496361,
"grad_norm": 0.07517435401678085,
"learning_rate": 4.9890499055066435e-05,
"loss": 0.2236532211303711,
"step": 325
},
{
"epoch": 0.060043668122270744,
"grad_norm": 0.111734539270401,
"learning_rate": 4.988350935671869e-05,
"loss": 0.21474847793579102,
"step": 330
},
{
"epoch": 0.060953420669577874,
"grad_norm": 0.09906989336013794,
"learning_rate": 4.987630394243866e-05,
"loss": 0.23321933746337892,
"step": 335
},
{
"epoch": 0.06186317321688501,
"grad_norm": 0.10131457448005676,
"learning_rate": 4.98688828746903e-05,
"loss": 0.2310662031173706,
"step": 340
},
{
"epoch": 0.06277292576419213,
"grad_norm": 0.09203507006168365,
"learning_rate": 4.986124621780708e-05,
"loss": 0.22021169662475587,
"step": 345
},
{
"epoch": 0.06368267831149928,
"grad_norm": 0.09505912661552429,
"learning_rate": 4.9853394037991416e-05,
"loss": 0.2197155237197876,
"step": 350
},
{
"epoch": 0.06459243085880641,
"grad_norm": 0.09038657695055008,
"learning_rate": 4.984532640331412e-05,
"loss": 0.22066287994384765,
"step": 355
},
{
"epoch": 0.06550218340611354,
"grad_norm": 0.09707064181566238,
"learning_rate": 4.9837043383713753e-05,
"loss": 0.22455451488494874,
"step": 360
},
{
"epoch": 0.06641193595342067,
"grad_norm": 0.10367228090763092,
"learning_rate": 4.98285450509961e-05,
"loss": 0.21993820667266845,
"step": 365
},
{
"epoch": 0.0673216885007278,
"grad_norm": 0.12229471653699875,
"learning_rate": 4.9819831478833456e-05,
"loss": 0.2168867588043213,
"step": 370
},
{
"epoch": 0.06823144104803494,
"grad_norm": 0.0964592918753624,
"learning_rate": 4.981090274276406e-05,
"loss": 0.21579203605651856,
"step": 375
},
{
"epoch": 0.06914119359534207,
"grad_norm": 0.09400496631860733,
"learning_rate": 4.980175892019141e-05,
"loss": 0.20972180366516113,
"step": 380
},
{
"epoch": 0.0700509461426492,
"grad_norm": 0.08158645778894424,
"learning_rate": 4.9792400090383594e-05,
"loss": 0.22148358821868896,
"step": 385
},
{
"epoch": 0.07096069868995633,
"grad_norm": 0.10916394740343094,
"learning_rate": 4.978282633447261e-05,
"loss": 0.2214418649673462,
"step": 390
},
{
"epoch": 0.07187045123726346,
"grad_norm": 0.11138810962438583,
"learning_rate": 4.9773037735453636e-05,
"loss": 0.21814754009246826,
"step": 395
},
{
"epoch": 0.07278020378457059,
"grad_norm": 0.10914396494626999,
"learning_rate": 4.9763034378184365e-05,
"loss": 0.21310818195343018,
"step": 400
},
{
"epoch": 0.07368995633187773,
"grad_norm": 0.1043366864323616,
"learning_rate": 4.975281634938421e-05,
"loss": 0.21266789436340333,
"step": 405
},
{
"epoch": 0.07459970887918486,
"grad_norm": 0.1036868542432785,
"learning_rate": 4.9742383737633594e-05,
"loss": 0.21606721878051757,
"step": 410
},
{
"epoch": 0.075509461426492,
"grad_norm": 0.11640442907810211,
"learning_rate": 4.9731736633373144e-05,
"loss": 0.21532948017120362,
"step": 415
},
{
"epoch": 0.07641921397379912,
"grad_norm": 0.11219926178455353,
"learning_rate": 4.9720875128902956e-05,
"loss": 0.2191627025604248,
"step": 420
},
{
"epoch": 0.07732896652110625,
"grad_norm": 0.12103637307882309,
"learning_rate": 4.970979931838176e-05,
"loss": 0.20938868522644044,
"step": 425
},
{
"epoch": 0.0782387190684134,
"grad_norm": 0.13274189829826355,
"learning_rate": 4.96985092978261e-05,
"loss": 0.21792960166931152,
"step": 430
},
{
"epoch": 0.07914847161572053,
"grad_norm": 0.11164513230323792,
"learning_rate": 4.968700516510954e-05,
"loss": 0.2022618055343628,
"step": 435
},
{
"epoch": 0.08005822416302766,
"grad_norm": 0.09532847255468369,
"learning_rate": 4.967528701996174e-05,
"loss": 0.21255812644958497,
"step": 440
},
{
"epoch": 0.08096797671033479,
"grad_norm": 0.10279258340597153,
"learning_rate": 4.96633549639677e-05,
"loss": 0.20683050155639648,
"step": 445
},
{
"epoch": 0.08187772925764192,
"grad_norm": 0.1257462352514267,
"learning_rate": 4.965120910056677e-05,
"loss": 0.21419920921325683,
"step": 450
},
{
"epoch": 0.08278748180494905,
"grad_norm": 0.11663137376308441,
"learning_rate": 4.963884953505186e-05,
"loss": 0.2072287082672119,
"step": 455
},
{
"epoch": 0.08369723435225619,
"grad_norm": 0.10488224029541016,
"learning_rate": 4.96262763745684e-05,
"loss": 0.1982678532600403,
"step": 460
},
{
"epoch": 0.08460698689956332,
"grad_norm": 0.11801692098379135,
"learning_rate": 4.961348972811354e-05,
"loss": 0.20662031173706055,
"step": 465
},
{
"epoch": 0.08551673944687045,
"grad_norm": 0.11318827420473099,
"learning_rate": 4.96004897065351e-05,
"loss": 0.20947303771972656,
"step": 470
},
{
"epoch": 0.08642649199417758,
"grad_norm": 0.13409486413002014,
"learning_rate": 4.95872764225307e-05,
"loss": 0.19670876264572143,
"step": 475
},
{
"epoch": 0.08733624454148471,
"grad_norm": 0.14440792798995972,
"learning_rate": 4.957384999064672e-05,
"loss": 0.19842848777770997,
"step": 480
},
{
"epoch": 0.08824599708879186,
"grad_norm": 0.12246996909379959,
"learning_rate": 4.956021052727731e-05,
"loss": 0.20318071842193602,
"step": 485
},
{
"epoch": 0.08915574963609899,
"grad_norm": 0.13437233865261078,
"learning_rate": 4.954635815066342e-05,
"loss": 0.21675212383270265,
"step": 490
},
{
"epoch": 0.09006550218340612,
"grad_norm": 0.11109672486782074,
"learning_rate": 4.9532292980891744e-05,
"loss": 0.2100757837295532,
"step": 495
},
{
"epoch": 0.09097525473071325,
"grad_norm": 0.1388893872499466,
"learning_rate": 4.9518015139893675e-05,
"loss": 0.20303285121917725,
"step": 500
},
{
"epoch": 0.09188500727802038,
"grad_norm": 0.13239721953868866,
"learning_rate": 4.950352475144427e-05,
"loss": 0.2152268409729004,
"step": 505
},
{
"epoch": 0.0927947598253275,
"grad_norm": 0.12834979593753815,
"learning_rate": 4.948882194116119e-05,
"loss": 0.20799248218536376,
"step": 510
},
{
"epoch": 0.09370451237263465,
"grad_norm": 0.11886704713106155,
"learning_rate": 4.947390683650354e-05,
"loss": 0.20394976139068605,
"step": 515
},
{
"epoch": 0.09461426491994178,
"grad_norm": 0.11398876458406448,
"learning_rate": 4.945877956677083e-05,
"loss": 0.2091092586517334,
"step": 520
},
{
"epoch": 0.09552401746724891,
"grad_norm": 0.1422540694475174,
"learning_rate": 4.944344026310186e-05,
"loss": 0.19564238786697388,
"step": 525
},
{
"epoch": 0.09643377001455604,
"grad_norm": 0.11359584331512451,
"learning_rate": 4.9427889058473535e-05,
"loss": 0.20493624210357667,
"step": 530
},
{
"epoch": 0.09734352256186317,
"grad_norm": 0.11703553050756454,
"learning_rate": 4.941212608769974e-05,
"loss": 0.2098615884780884,
"step": 535
},
{
"epoch": 0.0982532751091703,
"grad_norm": 0.14552047848701477,
"learning_rate": 4.939615148743017e-05,
"loss": 0.20382182598114013,
"step": 540
},
{
"epoch": 0.09916302765647744,
"grad_norm": 0.13178016245365143,
"learning_rate": 4.937996539614914e-05,
"loss": 0.19901862144470214,
"step": 545
},
{
"epoch": 0.10007278020378457,
"grad_norm": 0.635392427444458,
"learning_rate": 4.936356795417439e-05,
"loss": 0.20694944858551026,
"step": 550
},
{
"epoch": 0.1009825327510917,
"grad_norm": 0.15019077062606812,
"learning_rate": 4.934695930365586e-05,
"loss": 0.19313746690750122,
"step": 555
},
{
"epoch": 0.10189228529839883,
"grad_norm": 0.12941956520080566,
"learning_rate": 4.9330139588574474e-05,
"loss": 0.19671722650527954,
"step": 560
},
{
"epoch": 0.10280203784570596,
"grad_norm": 0.13818831741809845,
"learning_rate": 4.931310895474088e-05,
"loss": 0.20026786327362062,
"step": 565
},
{
"epoch": 0.1037117903930131,
"grad_norm": 0.12011194974184036,
"learning_rate": 4.929586754979417e-05,
"loss": 0.1932437539100647,
"step": 570
},
{
"epoch": 0.10462154294032024,
"grad_norm": 0.1345364898443222,
"learning_rate": 4.9278415523200644e-05,
"loss": 0.20245940685272218,
"step": 575
},
{
"epoch": 0.10553129548762737,
"grad_norm": 0.13281017541885376,
"learning_rate": 4.926075302625247e-05,
"loss": 0.19864981174468993,
"step": 580
},
{
"epoch": 0.1064410480349345,
"grad_norm": 0.13465586304664612,
"learning_rate": 4.924288021206639e-05,
"loss": 0.19573183059692384,
"step": 585
},
{
"epoch": 0.10735080058224163,
"grad_norm": 0.15225961804389954,
"learning_rate": 4.9224797235582396e-05,
"loss": 0.19946801662445068,
"step": 590
},
{
"epoch": 0.10826055312954876,
"grad_norm": 0.12816746532917023,
"learning_rate": 4.92065042535624e-05,
"loss": 0.19851526021957397,
"step": 595
},
{
"epoch": 0.1091703056768559,
"grad_norm": 0.13802853226661682,
"learning_rate": 4.9188001424588824e-05,
"loss": 0.19321763515472412,
"step": 600
},
{
"epoch": 0.11008005822416303,
"grad_norm": 0.17504797875881195,
"learning_rate": 4.9169288909063295e-05,
"loss": 0.2032616138458252,
"step": 605
},
{
"epoch": 0.11098981077147016,
"grad_norm": 0.13544194400310516,
"learning_rate": 4.91503668692052e-05,
"loss": 0.2011256456375122,
"step": 610
},
{
"epoch": 0.11189956331877729,
"grad_norm": 1.3976134061813354,
"learning_rate": 4.91312354690503e-05,
"loss": 0.19916868209838867,
"step": 615
},
{
"epoch": 0.11280931586608442,
"grad_norm": 0.1465059071779251,
"learning_rate": 4.91118948744493e-05,
"loss": 0.19487457275390624,
"step": 620
},
{
"epoch": 0.11371906841339156,
"grad_norm": 0.12103168666362762,
"learning_rate": 4.909234525306645e-05,
"loss": 0.1907251238822937,
"step": 625
},
{
"epoch": 0.1146288209606987,
"grad_norm": 0.12660574913024902,
"learning_rate": 4.907258677437802e-05,
"loss": 0.19327253103256226,
"step": 630
},
{
"epoch": 0.11553857350800582,
"grad_norm": 0.1347813606262207,
"learning_rate": 4.90526196096709e-05,
"loss": 0.19637736082077026,
"step": 635
},
{
"epoch": 0.11644832605531295,
"grad_norm": 0.14953652024269104,
"learning_rate": 4.903244393204107e-05,
"loss": 0.20325069427490233,
"step": 640
},
{
"epoch": 0.11735807860262008,
"grad_norm": 0.13936272263526917,
"learning_rate": 4.901205991639213e-05,
"loss": 0.1930275321006775,
"step": 645
},
{
"epoch": 0.11826783114992721,
"grad_norm": 0.1448420137166977,
"learning_rate": 4.899146773943374e-05,
"loss": 0.20026936531066894,
"step": 650
},
{
"epoch": 0.11917758369723436,
"grad_norm": 0.1312534064054489,
"learning_rate": 4.897066757968014e-05,
"loss": 0.19062033891677857,
"step": 655
},
{
"epoch": 0.12008733624454149,
"grad_norm": 0.13644742965698242,
"learning_rate": 4.894965961744859e-05,
"loss": 0.18719595670700073,
"step": 660
},
{
"epoch": 0.12099708879184862,
"grad_norm": 0.14276087284088135,
"learning_rate": 4.892844403485777e-05,
"loss": 0.19784307479858398,
"step": 665
},
{
"epoch": 0.12190684133915575,
"grad_norm": 0.14735399186611176,
"learning_rate": 4.890702101582623e-05,
"loss": 0.19163782596588136,
"step": 670
},
{
"epoch": 0.12281659388646288,
"grad_norm": 0.15742065012454987,
"learning_rate": 4.888539074607082e-05,
"loss": 0.19312986135482788,
"step": 675
},
{
"epoch": 0.12372634643377002,
"grad_norm": 0.12917031347751617,
"learning_rate": 4.8863553413105025e-05,
"loss": 0.20066320896148682,
"step": 680
},
{
"epoch": 0.12463609898107715,
"grad_norm": 0.1484801322221756,
"learning_rate": 4.884150920623737e-05,
"loss": 0.20096096992492676,
"step": 685
},
{
"epoch": 0.12554585152838427,
"grad_norm": 0.1455296128988266,
"learning_rate": 4.88192583165698e-05,
"loss": 0.20518505573272705,
"step": 690
},
{
"epoch": 0.12645560407569142,
"grad_norm": 0.14517490565776825,
"learning_rate": 4.879680093699598e-05,
"loss": 0.18859238624572755,
"step": 695
},
{
"epoch": 0.12736535662299855,
"grad_norm": 0.18778090178966522,
"learning_rate": 4.877413726219964e-05,
"loss": 0.197074818611145,
"step": 700
},
{
"epoch": 0.12827510917030568,
"grad_norm": 0.13497677445411682,
"learning_rate": 4.87512674886529e-05,
"loss": 0.18713107109069824,
"step": 705
},
{
"epoch": 0.12918486171761281,
"grad_norm": 0.12657155096530914,
"learning_rate": 4.872819181461455e-05,
"loss": 0.1858484387397766,
"step": 710
},
{
"epoch": 0.13009461426491994,
"grad_norm": 0.11458148807287216,
"learning_rate": 4.870491044012834e-05,
"loss": 0.18732179403305055,
"step": 715
},
{
"epoch": 0.13100436681222707,
"grad_norm": 0.13000249862670898,
"learning_rate": 4.8681423567021244e-05,
"loss": 0.1872936010360718,
"step": 720
},
{
"epoch": 0.1319141193595342,
"grad_norm": 0.14580890536308289,
"learning_rate": 4.865773139890172e-05,
"loss": 0.19280019998550416,
"step": 725
},
{
"epoch": 0.13282387190684133,
"grad_norm": 0.1507277935743332,
"learning_rate": 4.8633834141157913e-05,
"loss": 0.1898929238319397,
"step": 730
},
{
"epoch": 0.13373362445414846,
"grad_norm": 0.1418737769126892,
"learning_rate": 4.860973200095592e-05,
"loss": 0.17926375865936278,
"step": 735
},
{
"epoch": 0.1346433770014556,
"grad_norm": 0.17151866853237152,
"learning_rate": 4.858542518723794e-05,
"loss": 0.18963592052459716,
"step": 740
},
{
"epoch": 0.13555312954876272,
"grad_norm": 0.11162743717432022,
"learning_rate": 4.8560913910720535e-05,
"loss": 0.19466646909713745,
"step": 745
},
{
"epoch": 0.13646288209606988,
"grad_norm": 0.15628376603126526,
"learning_rate": 4.8536198383892725e-05,
"loss": 0.19494034051895143,
"step": 750
},
{
"epoch": 0.137372634643377,
"grad_norm": 0.18209289014339447,
"learning_rate": 4.851127882101421e-05,
"loss": 0.18747550249099731,
"step": 755
},
{
"epoch": 0.13828238719068414,
"grad_norm": 0.14559614658355713,
"learning_rate": 4.8486155438113454e-05,
"loss": 0.1897158980369568,
"step": 760
},
{
"epoch": 0.13919213973799127,
"grad_norm": 0.3198587894439697,
"learning_rate": 4.846082845298586e-05,
"loss": 0.18571001291275024,
"step": 765
},
{
"epoch": 0.1401018922852984,
"grad_norm": 0.1486678421497345,
"learning_rate": 4.843529808519189e-05,
"loss": 0.19561930894851684,
"step": 770
},
{
"epoch": 0.14101164483260553,
"grad_norm": 0.15318170189857483,
"learning_rate": 4.840956455605509e-05,
"loss": 0.187040114402771,
"step": 775
},
{
"epoch": 0.14192139737991266,
"grad_norm": 0.13754244148731232,
"learning_rate": 4.838362808866025e-05,
"loss": 0.18345539569854735,
"step": 780
},
{
"epoch": 0.1428311499272198,
"grad_norm": 0.12943248450756073,
"learning_rate": 4.835748890785143e-05,
"loss": 0.1921079397201538,
"step": 785
},
{
"epoch": 0.14374090247452692,
"grad_norm": 0.110458143055439,
"learning_rate": 4.833114724023001e-05,
"loss": 0.17927205562591553,
"step": 790
},
{
"epoch": 0.14465065502183405,
"grad_norm": 0.2421770840883255,
"learning_rate": 4.830460331415275e-05,
"loss": 0.18317567110061644,
"step": 795
},
{
"epoch": 0.14556040756914118,
"grad_norm": 0.14752762019634247,
"learning_rate": 4.8277857359729787e-05,
"loss": 0.1843916058540344,
"step": 800
},
{
"epoch": 0.14647016011644834,
"grad_norm": 0.15043556690216064,
"learning_rate": 4.8250909608822644e-05,
"loss": 0.18354393243789674,
"step": 805
},
{
"epoch": 0.14737991266375547,
"grad_norm": 0.1381794661283493,
"learning_rate": 4.822376029504223e-05,
"loss": 0.1789781332015991,
"step": 810
},
{
"epoch": 0.1482896652110626,
"grad_norm": 0.18386174738407135,
"learning_rate": 4.819640965374681e-05,
"loss": 0.19494292736053467,
"step": 815
},
{
"epoch": 0.14919941775836973,
"grad_norm": 0.13829593360424042,
"learning_rate": 4.816885792203996e-05,
"loss": 0.18486063480377196,
"step": 820
},
{
"epoch": 0.15010917030567686,
"grad_norm": 0.15033291280269623,
"learning_rate": 4.814110533876852e-05,
"loss": 0.18061509132385253,
"step": 825
},
{
"epoch": 0.151018922852984,
"grad_norm": 0.17150473594665527,
"learning_rate": 4.811315214452051e-05,
"loss": 0.18464866876602173,
"step": 830
},
{
"epoch": 0.15192867540029112,
"grad_norm": 0.15317125618457794,
"learning_rate": 4.808499858162307e-05,
"loss": 0.1837708592414856,
"step": 835
},
{
"epoch": 0.15283842794759825,
"grad_norm": 0.2671392560005188,
"learning_rate": 4.805664489414031e-05,
"loss": 0.19338636398315429,
"step": 840
},
{
"epoch": 0.15374818049490538,
"grad_norm": 0.14047028124332428,
"learning_rate": 4.802809132787125e-05,
"loss": 0.17069108486175538,
"step": 845
},
{
"epoch": 0.1546579330422125,
"grad_norm": 0.1520431935787201,
"learning_rate": 4.799933813034768e-05,
"loss": 0.18607735633850098,
"step": 850
},
{
"epoch": 0.15556768558951964,
"grad_norm": 0.17239463329315186,
"learning_rate": 4.797038555083197e-05,
"loss": 0.18069062232971192,
"step": 855
},
{
"epoch": 0.1564774381368268,
"grad_norm": 0.1377955675125122,
"learning_rate": 4.794123384031495e-05,
"loss": 0.18870222568511963,
"step": 860
},
{
"epoch": 0.15738719068413393,
"grad_norm": 0.15901461243629456,
"learning_rate": 4.791188325151373e-05,
"loss": 0.18128334283828734,
"step": 865
},
{
"epoch": 0.15829694323144106,
"grad_norm": 0.14634132385253906,
"learning_rate": 4.7882334038869495e-05,
"loss": 0.1866163969039917,
"step": 870
},
{
"epoch": 0.1592066957787482,
"grad_norm": 0.15361061692237854,
"learning_rate": 4.785258645854529e-05,
"loss": 0.17850807905197144,
"step": 875
},
{
"epoch": 0.16011644832605532,
"grad_norm": 0.13751649856567383,
"learning_rate": 4.782264076842385e-05,
"loss": 0.17731113433837892,
"step": 880
},
{
"epoch": 0.16102620087336245,
"grad_norm": 0.17909638583660126,
"learning_rate": 4.7792497228105314e-05,
"loss": 0.18344542980194092,
"step": 885
},
{
"epoch": 0.16193595342066958,
"grad_norm": 0.16038304567337036,
"learning_rate": 4.776215609890498e-05,
"loss": 0.18868647813796996,
"step": 890
},
{
"epoch": 0.1628457059679767,
"grad_norm": 0.1653951108455658,
"learning_rate": 4.773161764385107e-05,
"loss": 0.18614152669906617,
"step": 895
},
{
"epoch": 0.16375545851528384,
"grad_norm": 0.16193026304244995,
"learning_rate": 4.770088212768241e-05,
"loss": 0.18564575910568237,
"step": 900
},
{
"epoch": 0.16466521106259097,
"grad_norm": 0.16048531234264374,
"learning_rate": 4.7669949816846173e-05,
"loss": 0.18330031633377075,
"step": 905
},
{
"epoch": 0.1655749636098981,
"grad_norm": 0.1440177708864212,
"learning_rate": 4.7638820979495534e-05,
"loss": 0.17712442874908446,
"step": 910
},
{
"epoch": 0.16648471615720525,
"grad_norm": 0.19635969400405884,
"learning_rate": 4.760749588548738e-05,
"loss": 0.18679027557373046,
"step": 915
},
{
"epoch": 0.16739446870451238,
"grad_norm": 0.15576541423797607,
"learning_rate": 4.757597480637995e-05,
"loss": 0.19283764362335204,
"step": 920
},
{
"epoch": 0.1683042212518195,
"grad_norm": 0.1550331562757492,
"learning_rate": 4.7544258015430463e-05,
"loss": 0.18269542455673218,
"step": 925
},
{
"epoch": 0.16921397379912664,
"grad_norm": 0.18369626998901367,
"learning_rate": 4.75123457875928e-05,
"loss": 0.1697891116142273,
"step": 930
},
{
"epoch": 0.17012372634643377,
"grad_norm": 0.15266314148902893,
"learning_rate": 4.7480238399515074e-05,
"loss": 0.18523451089859008,
"step": 935
},
{
"epoch": 0.1710334788937409,
"grad_norm": 0.16709664463996887,
"learning_rate": 4.744793612953724e-05,
"loss": 0.1803238034248352,
"step": 940
},
{
"epoch": 0.17194323144104803,
"grad_norm": 0.14929179847240448,
"learning_rate": 4.741543925768872e-05,
"loss": 0.1861217737197876,
"step": 945
},
{
"epoch": 0.17285298398835516,
"grad_norm": 0.1362280696630478,
"learning_rate": 4.7382748065685915e-05,
"loss": 0.17896100282669067,
"step": 950
},
{
"epoch": 0.1737627365356623,
"grad_norm": 0.15290239453315735,
"learning_rate": 4.734986283692982e-05,
"loss": 0.18432788848876952,
"step": 955
},
{
"epoch": 0.17467248908296942,
"grad_norm": 0.1287035197019577,
"learning_rate": 4.73167838565035e-05,
"loss": 0.18485682010650634,
"step": 960
},
{
"epoch": 0.17558224163027655,
"grad_norm": 0.17969627678394318,
"learning_rate": 4.728351141116971e-05,
"loss": 0.17361557483673096,
"step": 965
},
{
"epoch": 0.1764919941775837,
"grad_norm": 0.13751201331615448,
"learning_rate": 4.7250045789368326e-05,
"loss": 0.1731679320335388,
"step": 970
},
{
"epoch": 0.17740174672489084,
"grad_norm": 0.1603265255689621,
"learning_rate": 4.721638728121388e-05,
"loss": 0.17308170795440675,
"step": 975
},
{
"epoch": 0.17831149927219797,
"grad_norm": 0.1592789888381958,
"learning_rate": 4.718253617849306e-05,
"loss": 0.17534757852554322,
"step": 980
},
{
"epoch": 0.1792212518195051,
"grad_norm": 0.12727224826812744,
"learning_rate": 4.714849277466214e-05,
"loss": 0.17817609310150145,
"step": 985
},
{
"epoch": 0.18013100436681223,
"grad_norm": 0.15401554107666016,
"learning_rate": 4.711425736484447e-05,
"loss": 0.1733405351638794,
"step": 990
},
{
"epoch": 0.18104075691411936,
"grad_norm": 0.13253968954086304,
"learning_rate": 4.7079830245827906e-05,
"loss": 0.17846795320510864,
"step": 995
},
{
"epoch": 0.1819505094614265,
"grad_norm": 0.21846213936805725,
"learning_rate": 4.7045211716062245e-05,
"loss": 0.18021599054336548,
"step": 1000
},
{
"epoch": 0.18286026200873362,
"grad_norm": 0.16867990791797638,
"learning_rate": 4.7010402075656595e-05,
"loss": 0.18232386112213134,
"step": 1005
},
{
"epoch": 0.18377001455604075,
"grad_norm": 0.17180582880973816,
"learning_rate": 4.697540162637686e-05,
"loss": 0.1816317319869995,
"step": 1010
},
{
"epoch": 0.18467976710334788,
"grad_norm": 0.16480213403701782,
"learning_rate": 4.694021067164303e-05,
"loss": 0.17718446254730225,
"step": 1015
},
{
"epoch": 0.185589519650655,
"grad_norm": 0.15015918016433716,
"learning_rate": 4.6904829516526605e-05,
"loss": 0.17412011623382567,
"step": 1020
},
{
"epoch": 0.18649927219796217,
"grad_norm": 0.14445139467716217,
"learning_rate": 4.686925846774795e-05,
"loss": 0.1778018832206726,
"step": 1025
},
{
"epoch": 0.1874090247452693,
"grad_norm": 0.1701960265636444,
"learning_rate": 4.683349783367362e-05,
"loss": 0.16901081800460815,
"step": 1030
},
{
"epoch": 0.18831877729257643,
"grad_norm": 0.15894867479801178,
"learning_rate": 4.679754792431368e-05,
"loss": 0.17055928707122803,
"step": 1035
},
{
"epoch": 0.18922852983988356,
"grad_norm": 0.1511942446231842,
"learning_rate": 4.676140905131903e-05,
"loss": 0.17339680194854737,
"step": 1040
},
{
"epoch": 0.1901382823871907,
"grad_norm": 0.14735209941864014,
"learning_rate": 4.672508152797872e-05,
"loss": 0.17802717685699462,
"step": 1045
},
{
"epoch": 0.19104803493449782,
"grad_norm": 0.17367291450500488,
"learning_rate": 4.66885656692172e-05,
"loss": 0.1732744097709656,
"step": 1050
},
{
"epoch": 0.19195778748180495,
"grad_norm": 0.147227481007576,
"learning_rate": 4.665186179159159e-05,
"loss": 0.17040517330169677,
"step": 1055
},
{
"epoch": 0.19286754002911208,
"grad_norm": 0.1709655076265335,
"learning_rate": 4.6614970213289e-05,
"loss": 0.17794088125228882,
"step": 1060
},
{
"epoch": 0.1937772925764192,
"grad_norm": 0.1588088721036911,
"learning_rate": 4.657789125412366e-05,
"loss": 0.17180380821228028,
"step": 1065
},
{
"epoch": 0.19468704512372634,
"grad_norm": 0.14827021956443787,
"learning_rate": 4.654062523553428e-05,
"loss": 0.182997989654541,
"step": 1070
},
{
"epoch": 0.19559679767103347,
"grad_norm": 0.16230466961860657,
"learning_rate": 4.6503172480581126e-05,
"loss": 0.17346880435943604,
"step": 1075
},
{
"epoch": 0.1965065502183406,
"grad_norm": 0.1637624353170395,
"learning_rate": 4.646553331394333e-05,
"loss": 0.17263576984405518,
"step": 1080
},
{
"epoch": 0.19741630276564776,
"grad_norm": 0.15977843105793,
"learning_rate": 4.642770806191603e-05,
"loss": 0.17284308671951293,
"step": 1085
},
{
"epoch": 0.19832605531295489,
"grad_norm": 0.15394869446754456,
"learning_rate": 4.6389697052407534e-05,
"loss": 0.17797101736068727,
"step": 1090
},
{
"epoch": 0.19923580786026202,
"grad_norm": 0.15995225310325623,
"learning_rate": 4.6351500614936485e-05,
"loss": 0.18137198686599731,
"step": 1095
},
{
"epoch": 0.20014556040756915,
"grad_norm": 0.1779479682445526,
"learning_rate": 4.6313119080629006e-05,
"loss": 0.17998344898223878,
"step": 1100
},
{
"epoch": 0.20105531295487628,
"grad_norm": 0.14362832903862,
"learning_rate": 4.627455278221584e-05,
"loss": 0.18196423053741456,
"step": 1105
},
{
"epoch": 0.2019650655021834,
"grad_norm": 0.15951639413833618,
"learning_rate": 4.623580205402947e-05,
"loss": 0.17423888444900512,
"step": 1110
},
{
"epoch": 0.20287481804949054,
"grad_norm": 0.17273563146591187,
"learning_rate": 4.619686723200115e-05,
"loss": 0.17392473220825194,
"step": 1115
},
{
"epoch": 0.20378457059679767,
"grad_norm": 0.1655360758304596,
"learning_rate": 4.615774865365813e-05,
"loss": 0.17528389692306517,
"step": 1120
},
{
"epoch": 0.2046943231441048,
"grad_norm": 0.15920691192150116,
"learning_rate": 4.611844665812058e-05,
"loss": 0.1806849241256714,
"step": 1125
},
{
"epoch": 0.20560407569141192,
"grad_norm": 0.16114577651023865,
"learning_rate": 4.607896158609875e-05,
"loss": 0.17217352390289306,
"step": 1130
},
{
"epoch": 0.20651382823871905,
"grad_norm": 0.1499422937631607,
"learning_rate": 4.603929377988999e-05,
"loss": 0.17806737422943114,
"step": 1135
},
{
"epoch": 0.2074235807860262,
"grad_norm": 0.17605191469192505,
"learning_rate": 4.5999443583375765e-05,
"loss": 0.17842113971710205,
"step": 1140
},
{
"epoch": 0.20833333333333334,
"grad_norm": 0.16117210686206818,
"learning_rate": 4.595941134201871e-05,
"loss": 0.18379683494567872,
"step": 1145
},
{
"epoch": 0.20924308588064047,
"grad_norm": 0.21199050545692444,
"learning_rate": 4.591919740285957e-05,
"loss": 0.16286123991012574,
"step": 1150
},
{
"epoch": 0.2101528384279476,
"grad_norm": 0.15100529789924622,
"learning_rate": 4.587880211451427e-05,
"loss": 0.17995200157165528,
"step": 1155
},
{
"epoch": 0.21106259097525473,
"grad_norm": 0.16618172824382782,
"learning_rate": 4.583822582717085e-05,
"loss": 0.16960303783416747,
"step": 1160
},
{
"epoch": 0.21197234352256186,
"grad_norm": 0.14743569493293762,
"learning_rate": 4.579746889258643e-05,
"loss": 0.17762668132781984,
"step": 1165
},
{
"epoch": 0.212882096069869,
"grad_norm": 0.1697179079055786,
"learning_rate": 4.575653166408417e-05,
"loss": 0.16656005382537842,
"step": 1170
},
{
"epoch": 0.21379184861717612,
"grad_norm": 0.14886513352394104,
"learning_rate": 4.57154144965502e-05,
"loss": 0.17091882228851318,
"step": 1175
},
{
"epoch": 0.21470160116448325,
"grad_norm": 0.18197473883628845,
"learning_rate": 4.5674117746430556e-05,
"loss": 0.1770920753479004,
"step": 1180
},
{
"epoch": 0.21561135371179038,
"grad_norm": 0.17323088645935059,
"learning_rate": 4.563264177172807e-05,
"loss": 0.1734643578529358,
"step": 1185
},
{
"epoch": 0.2165211062590975,
"grad_norm": 0.1521984338760376,
"learning_rate": 4.559098693199929e-05,
"loss": 0.17515116930007935,
"step": 1190
},
{
"epoch": 0.21743085880640467,
"grad_norm": 0.1842304915189743,
"learning_rate": 4.554915358835134e-05,
"loss": 0.16798022985458375,
"step": 1195
},
{
"epoch": 0.2183406113537118,
"grad_norm": 0.14753451943397522,
"learning_rate": 4.5507142103438794e-05,
"loss": 0.1755476713180542,
"step": 1200
},
{
"epoch": 0.21925036390101893,
"grad_norm": 0.17096194624900818,
"learning_rate": 4.546495284146057e-05,
"loss": 0.1792473554611206,
"step": 1205
},
{
"epoch": 0.22016011644832606,
"grad_norm": 0.1579233556985855,
"learning_rate": 4.542258616815669e-05,
"loss": 0.17230144739151002,
"step": 1210
},
{
"epoch": 0.2210698689956332,
"grad_norm": 0.177297905087471,
"learning_rate": 4.5380042450805216e-05,
"loss": 0.1807127833366394,
"step": 1215
},
{
"epoch": 0.22197962154294032,
"grad_norm": 0.14331696927547455,
"learning_rate": 4.533732205821897e-05,
"loss": 0.17201389074325563,
"step": 1220
},
{
"epoch": 0.22288937409024745,
"grad_norm": 0.14473360776901245,
"learning_rate": 4.529442536074239e-05,
"loss": 0.17036900520324708,
"step": 1225
},
{
"epoch": 0.22379912663755458,
"grad_norm": 0.1820901483297348,
"learning_rate": 4.5251352730248314e-05,
"loss": 0.17704882621765136,
"step": 1230
},
{
"epoch": 0.2247088791848617,
"grad_norm": 0.1948976367712021,
"learning_rate": 4.5208104540134746e-05,
"loss": 0.1706973433494568,
"step": 1235
},
{
"epoch": 0.22561863173216884,
"grad_norm": 0.16660070419311523,
"learning_rate": 4.51646811653216e-05,
"loss": 0.17636821269989014,
"step": 1240
},
{
"epoch": 0.22652838427947597,
"grad_norm": 0.1699984073638916,
"learning_rate": 4.512108298224751e-05,
"loss": 0.16986632347106934,
"step": 1245
},
{
"epoch": 0.22743813682678313,
"grad_norm": 0.17601042985916138,
"learning_rate": 4.50773103688665e-05,
"loss": 0.17507898807525635,
"step": 1250
},
{
"epoch": 0.22834788937409026,
"grad_norm": 0.17557238042354584,
"learning_rate": 4.503336370464476e-05,
"loss": 0.17702863216400147,
"step": 1255
},
{
"epoch": 0.2292576419213974,
"grad_norm": 0.1800651252269745,
"learning_rate": 4.498924337055729e-05,
"loss": 0.16419180631637573,
"step": 1260
},
{
"epoch": 0.23016739446870452,
"grad_norm": 0.2022479772567749,
"learning_rate": 4.494494974908468e-05,
"loss": 0.17482060194015503,
"step": 1265
},
{
"epoch": 0.23107714701601165,
"grad_norm": 0.14180205762386322,
"learning_rate": 4.490048322420973e-05,
"loss": 0.1723136067390442,
"step": 1270
},
{
"epoch": 0.23198689956331878,
"grad_norm": 0.18607310950756073,
"learning_rate": 4.485584418141419e-05,
"loss": 0.17096419334411622,
"step": 1275
},
{
"epoch": 0.2328966521106259,
"grad_norm": 0.15958310663700104,
"learning_rate": 4.481103300767529e-05,
"loss": 0.1656244158744812,
"step": 1280
},
{
"epoch": 0.23380640465793304,
"grad_norm": 0.17552383244037628,
"learning_rate": 4.476605009146255e-05,
"loss": 0.17677626609802247,
"step": 1285
},
{
"epoch": 0.23471615720524017,
"grad_norm": 0.15299823880195618,
"learning_rate": 4.472089582273429e-05,
"loss": 0.1778991103172302,
"step": 1290
},
{
"epoch": 0.2356259097525473,
"grad_norm": 0.14613987505435944,
"learning_rate": 4.46755705929343e-05,
"loss": 0.17071452140808105,
"step": 1295
},
{
"epoch": 0.23653566229985443,
"grad_norm": 0.17781122028827667,
"learning_rate": 4.463007479498843e-05,
"loss": 0.16955430507659913,
"step": 1300
},
{
"epoch": 0.23744541484716158,
"grad_norm": 0.16326487064361572,
"learning_rate": 4.458440882330119e-05,
"loss": 0.1777693510055542,
"step": 1305
},
{
"epoch": 0.23835516739446871,
"grad_norm": 0.17701926827430725,
"learning_rate": 4.4538573073752365e-05,
"loss": 0.16323351860046387,
"step": 1310
},
{
"epoch": 0.23926491994177584,
"grad_norm": 0.13104717433452606,
"learning_rate": 4.449256794369349e-05,
"loss": 0.17653456926345826,
"step": 1315
},
{
"epoch": 0.24017467248908297,
"grad_norm": 0.1796836256980896,
"learning_rate": 4.444639383194452e-05,
"loss": 0.17189600467681884,
"step": 1320
},
{
"epoch": 0.2410844250363901,
"grad_norm": 0.14919696748256683,
"learning_rate": 4.440005113879029e-05,
"loss": 0.17003334760665895,
"step": 1325
},
{
"epoch": 0.24199417758369723,
"grad_norm": 0.1728784441947937,
"learning_rate": 4.4353540265977064e-05,
"loss": 0.17397408485412597,
"step": 1330
},
{
"epoch": 0.24290393013100436,
"grad_norm": 0.14591015875339508,
"learning_rate": 4.43068616167091e-05,
"loss": 0.16498478651046752,
"step": 1335
},
{
"epoch": 0.2438136826783115,
"grad_norm": 0.18417201936244965,
"learning_rate": 4.4260015595645055e-05,
"loss": 0.16841750144958495,
"step": 1340
},
{
"epoch": 0.24472343522561862,
"grad_norm": 0.16264279186725616,
"learning_rate": 4.4213002608894605e-05,
"loss": 0.16907373666763306,
"step": 1345
},
{
"epoch": 0.24563318777292575,
"grad_norm": 0.15248481929302216,
"learning_rate": 4.416582306401481e-05,
"loss": 0.15931472778320313,
"step": 1350
},
{
"epoch": 0.24654294032023288,
"grad_norm": 0.1488373875617981,
"learning_rate": 4.4118477370006636e-05,
"loss": 0.1701716423034668,
"step": 1355
},
{
"epoch": 0.24745269286754004,
"grad_norm": 0.14679782092571259,
"learning_rate": 4.407096593731142e-05,
"loss": 0.157412326335907,
"step": 1360
},
{
"epoch": 0.24836244541484717,
"grad_norm": 0.17139530181884766,
"learning_rate": 4.402328917780728e-05,
"loss": 0.17303754091262818,
"step": 1365
},
{
"epoch": 0.2492721979621543,
"grad_norm": 0.1534871757030487,
"learning_rate": 4.397544750480554e-05,
"loss": 0.1786255121231079,
"step": 1370
},
{
"epoch": 0.2501819505094614,
"grad_norm": 0.1876252293586731,
"learning_rate": 4.39274413330472e-05,
"loss": 0.16442898511886597,
"step": 1375
},
{
"epoch": 0.25109170305676853,
"grad_norm": 0.16165752708911896,
"learning_rate": 4.387927107869928e-05,
"loss": 0.1780426025390625,
"step": 1380
},
{
"epoch": 0.25200145560407566,
"grad_norm": 0.17242255806922913,
"learning_rate": 4.383093715935124e-05,
"loss": 0.15959256887435913,
"step": 1385
},
{
"epoch": 0.25291120815138285,
"grad_norm": 0.1627114862203598,
"learning_rate": 4.378243999401137e-05,
"loss": 0.17606115341186523,
"step": 1390
},
{
"epoch": 0.25382096069869,
"grad_norm": 0.15911224484443665,
"learning_rate": 4.373378000310312e-05,
"loss": 0.16798585653305054,
"step": 1395
},
{
"epoch": 0.2547307132459971,
"grad_norm": 0.15542249381542206,
"learning_rate": 4.3684957608461505e-05,
"loss": 0.1695417881011963,
"step": 1400
},
{
"epoch": 0.25564046579330424,
"grad_norm": 0.1475304812192917,
"learning_rate": 4.363597323332941e-05,
"loss": 0.16340878009796142,
"step": 1405
},
{
"epoch": 0.25655021834061137,
"grad_norm": 0.16943927109241486,
"learning_rate": 4.358682730235395e-05,
"loss": 0.17240238189697266,
"step": 1410
},
{
"epoch": 0.2574599708879185,
"grad_norm": 0.1816391944885254,
"learning_rate": 4.3537520241582744e-05,
"loss": 0.16558437347412108,
"step": 1415
},
{
"epoch": 0.25836972343522563,
"grad_norm": 0.23851341009140015,
"learning_rate": 4.348805247846027e-05,
"loss": 0.16796000003814698,
"step": 1420
},
{
"epoch": 0.25927947598253276,
"grad_norm": 0.15415243804454803,
"learning_rate": 4.343842444182414e-05,
"loss": 0.1746017098426819,
"step": 1425
},
{
"epoch": 0.2601892285298399,
"grad_norm": 0.15651032328605652,
"learning_rate": 4.338863656190139e-05,
"loss": 0.1649057984352112,
"step": 1430
},
{
"epoch": 0.261098981077147,
"grad_norm": 0.16601966321468353,
"learning_rate": 4.333868927030471e-05,
"loss": 0.15888988971710205,
"step": 1435
},
{
"epoch": 0.26200873362445415,
"grad_norm": 0.1549467295408249,
"learning_rate": 4.328858300002876e-05,
"loss": 0.16357985734939576,
"step": 1440
},
{
"epoch": 0.2629184861717613,
"grad_norm": 0.16332370042800903,
"learning_rate": 4.32383181854464e-05,
"loss": 0.16749982833862304,
"step": 1445
},
{
"epoch": 0.2638282387190684,
"grad_norm": 0.14827077090740204,
"learning_rate": 4.3187895262304894e-05,
"loss": 0.16886214017868043,
"step": 1450
},
{
"epoch": 0.26473799126637554,
"grad_norm": 0.1557198166847229,
"learning_rate": 4.313731466772216e-05,
"loss": 0.17512214183807373,
"step": 1455
},
{
"epoch": 0.26564774381368267,
"grad_norm": 0.17263570427894592,
"learning_rate": 4.308657684018299e-05,
"loss": 0.16248074769973755,
"step": 1460
},
{
"epoch": 0.2665574963609898,
"grad_norm": 0.17135761678218842,
"learning_rate": 4.303568221953521e-05,
"loss": 0.16605921983718872,
"step": 1465
},
{
"epoch": 0.26746724890829693,
"grad_norm": 0.14322632551193237,
"learning_rate": 4.2984631246985897e-05,
"loss": 0.1610772728919983,
"step": 1470
},
{
"epoch": 0.26837700145560406,
"grad_norm": 0.18852312862873077,
"learning_rate": 4.2933424365097564e-05,
"loss": 0.1686462163925171,
"step": 1475
},
{
"epoch": 0.2692867540029112,
"grad_norm": 0.1780245155096054,
"learning_rate": 4.2882062017784294e-05,
"loss": 0.16953932046890258,
"step": 1480
},
{
"epoch": 0.2701965065502183,
"grad_norm": 0.180568665266037,
"learning_rate": 4.2830544650307895e-05,
"loss": 0.16442664861679077,
"step": 1485
},
{
"epoch": 0.27110625909752545,
"grad_norm": 0.16876435279846191,
"learning_rate": 4.277887270927407e-05,
"loss": 0.17128173112869263,
"step": 1490
},
{
"epoch": 0.2720160116448326,
"grad_norm": 0.164053276181221,
"learning_rate": 4.2727046642628513e-05,
"loss": 0.16331382989883422,
"step": 1495
},
{
"epoch": 0.27292576419213976,
"grad_norm": 0.14577528834342957,
"learning_rate": 4.267506689965305e-05,
"loss": 0.1638316035270691,
"step": 1500
},
{
"epoch": 0.2738355167394469,
"grad_norm": 0.1648740917444229,
"learning_rate": 4.262293393096171e-05,
"loss": 0.15332664251327516,
"step": 1505
},
{
"epoch": 0.274745269286754,
"grad_norm": 0.16445094347000122,
"learning_rate": 4.257064818849685e-05,
"loss": 0.1706634521484375,
"step": 1510
},
{
"epoch": 0.27565502183406115,
"grad_norm": 0.1584935486316681,
"learning_rate": 4.251821012552524e-05,
"loss": 0.1684114694595337,
"step": 1515
},
{
"epoch": 0.2765647743813683,
"grad_norm": 0.17215611040592194,
"learning_rate": 4.24656201966341e-05,
"loss": 0.15594131946563722,
"step": 1520
},
{
"epoch": 0.2774745269286754,
"grad_norm": 0.15945589542388916,
"learning_rate": 4.2412878857727214e-05,
"loss": 0.1686659574508667,
"step": 1525
},
{
"epoch": 0.27838427947598254,
"grad_norm": 0.16103951632976532,
"learning_rate": 4.2359986566020906e-05,
"loss": 0.17779340744018554,
"step": 1530
},
{
"epoch": 0.2792940320232897,
"grad_norm": 0.1770307570695877,
"learning_rate": 4.230694378004014e-05,
"loss": 0.16786882877349854,
"step": 1535
},
{
"epoch": 0.2802037845705968,
"grad_norm": 0.16225053369998932,
"learning_rate": 4.2253750959614504e-05,
"loss": 0.16558897495269775,
"step": 1540
},
{
"epoch": 0.28111353711790393,
"grad_norm": 0.27213969826698303,
"learning_rate": 4.220040856587425e-05,
"loss": 0.1641119599342346,
"step": 1545
},
{
"epoch": 0.28202328966521106,
"grad_norm": 0.1773071587085724,
"learning_rate": 4.2146917061246284e-05,
"loss": 0.16919140815734862,
"step": 1550
},
{
"epoch": 0.2829330422125182,
"grad_norm": 0.15519705414772034,
"learning_rate": 4.209327690945014e-05,
"loss": 0.15501506328582765,
"step": 1555
},
{
"epoch": 0.2838427947598253,
"grad_norm": 0.19921597838401794,
"learning_rate": 4.203948857549402e-05,
"loss": 0.1690821886062622,
"step": 1560
},
{
"epoch": 0.28475254730713245,
"grad_norm": 0.15417630970478058,
"learning_rate": 4.1985552525670696e-05,
"loss": 0.1675640344619751,
"step": 1565
},
{
"epoch": 0.2856622998544396,
"grad_norm": 0.1739572137594223,
"learning_rate": 4.193146922755348e-05,
"loss": 0.16738017797470092,
"step": 1570
},
{
"epoch": 0.2865720524017467,
"grad_norm": 0.1384361982345581,
"learning_rate": 4.187723914999221e-05,
"loss": 0.16802358627319336,
"step": 1575
},
{
"epoch": 0.28748180494905384,
"grad_norm": 0.1491454839706421,
"learning_rate": 4.182286276310915e-05,
"loss": 0.1619583249092102,
"step": 1580
},
{
"epoch": 0.288391557496361,
"grad_norm": 0.15831919014453888,
"learning_rate": 4.176834053829492e-05,
"loss": 0.1625199794769287,
"step": 1585
},
{
"epoch": 0.2893013100436681,
"grad_norm": 0.16265396773815155,
"learning_rate": 4.1713672948204416e-05,
"loss": 0.16718552112579346,
"step": 1590
},
{
"epoch": 0.29021106259097523,
"grad_norm": 0.15153461694717407,
"learning_rate": 4.1658860466752714e-05,
"loss": 0.15979087352752686,
"step": 1595
},
{
"epoch": 0.29112081513828236,
"grad_norm": 0.1620412915945053,
"learning_rate": 4.160390356911096e-05,
"loss": 0.16103557348251343,
"step": 1600
},
{
"epoch": 0.2920305676855895,
"grad_norm": 0.16673807799816132,
"learning_rate": 4.154880273170223e-05,
"loss": 0.16394708156585694,
"step": 1605
},
{
"epoch": 0.2929403202328967,
"grad_norm": 0.14834867417812347,
"learning_rate": 4.149355843219744e-05,
"loss": 0.15916435718536376,
"step": 1610
},
{
"epoch": 0.2938500727802038,
"grad_norm": 0.16977964341640472,
"learning_rate": 4.143817114951119e-05,
"loss": 0.16538127660751342,
"step": 1615
},
{
"epoch": 0.29475982532751094,
"grad_norm": 0.17986875772476196,
"learning_rate": 4.138264136379756e-05,
"loss": 0.15514618158340454,
"step": 1620
},
{
"epoch": 0.29566957787481807,
"grad_norm": 0.15794920921325684,
"learning_rate": 4.132696955644605e-05,
"loss": 0.15992183685302735,
"step": 1625
},
{
"epoch": 0.2965793304221252,
"grad_norm": 0.19955399632453918,
"learning_rate": 4.127115621007731e-05,
"loss": 0.16362056732177735,
"step": 1630
},
{
"epoch": 0.29748908296943233,
"grad_norm": 0.1352023035287857,
"learning_rate": 4.121520180853903e-05,
"loss": 0.15631601810455323,
"step": 1635
},
{
"epoch": 0.29839883551673946,
"grad_norm": 0.15340781211853027,
"learning_rate": 4.1159106836901674e-05,
"loss": 0.1571858048439026,
"step": 1640
},
{
"epoch": 0.2993085880640466,
"grad_norm": 0.15311770141124725,
"learning_rate": 4.110287178145433e-05,
"loss": 0.16082344055175782,
"step": 1645
},
{
"epoch": 0.3002183406113537,
"grad_norm": 0.17811627686023712,
"learning_rate": 4.10464971297005e-05,
"loss": 0.16117215156555176,
"step": 1650
},
{
"epoch": 0.30112809315866085,
"grad_norm": 0.21060039103031158,
"learning_rate": 4.0989983370353805e-05,
"loss": 0.15838587284088135,
"step": 1655
},
{
"epoch": 0.302037845705968,
"grad_norm": 0.155836820602417,
"learning_rate": 4.093333099333383e-05,
"loss": 0.16648870706558228,
"step": 1660
},
{
"epoch": 0.3029475982532751,
"grad_norm": 0.13711698353290558,
"learning_rate": 4.0876540489761826e-05,
"loss": 0.16899349689483642,
"step": 1665
},
{
"epoch": 0.30385735080058224,
"grad_norm": 0.15162716805934906,
"learning_rate": 4.0819612351956485e-05,
"loss": 0.16574090719223022,
"step": 1670
},
{
"epoch": 0.30476710334788937,
"grad_norm": 0.15016348659992218,
"learning_rate": 4.0762547073429615e-05,
"loss": 0.1689780354499817,
"step": 1675
},
{
"epoch": 0.3056768558951965,
"grad_norm": 0.15182986855506897,
"learning_rate": 4.070534514888194e-05,
"loss": 0.1593686819076538,
"step": 1680
},
{
"epoch": 0.3065866084425036,
"grad_norm": 0.15648750960826874,
"learning_rate": 4.0648007074198765e-05,
"loss": 0.16436235904693602,
"step": 1685
},
{
"epoch": 0.30749636098981076,
"grad_norm": 0.18339484930038452,
"learning_rate": 4.0590533346445665e-05,
"loss": 0.1678077220916748,
"step": 1690
},
{
"epoch": 0.3084061135371179,
"grad_norm": 0.16426527500152588,
"learning_rate": 4.053292446386422e-05,
"loss": 0.1689227342605591,
"step": 1695
},
{
"epoch": 0.309315866084425,
"grad_norm": 0.16129335761070251,
"learning_rate": 4.047518092586766e-05,
"loss": 0.16592445373535156,
"step": 1700
},
{
"epoch": 0.31022561863173215,
"grad_norm": 0.15512363612651825,
"learning_rate": 4.041730323303654e-05,
"loss": 0.16142364740371704,
"step": 1705
},
{
"epoch": 0.3111353711790393,
"grad_norm": 0.159842386841774,
"learning_rate": 4.0359291887114425e-05,
"loss": 0.1702875852584839,
"step": 1710
},
{
"epoch": 0.3120451237263464,
"grad_norm": 0.19558854401111603,
"learning_rate": 4.030114739100352e-05,
"loss": 0.15966148376464845,
"step": 1715
},
{
"epoch": 0.3129548762736536,
"grad_norm": 0.1577496975660324,
"learning_rate": 4.024287024876029e-05,
"loss": 0.1620358943939209,
"step": 1720
},
{
"epoch": 0.3138646288209607,
"grad_norm": 0.1629355251789093,
"learning_rate": 4.0184460965591144e-05,
"loss": 0.16511552333831786,
"step": 1725
},
{
"epoch": 0.31477438136826785,
"grad_norm": 0.17060767114162445,
"learning_rate": 4.0125920047848e-05,
"loss": 0.15672838687896729,
"step": 1730
},
{
"epoch": 0.315684133915575,
"grad_norm": 0.22447620332241058,
"learning_rate": 4.006724800302394e-05,
"loss": 0.15339784622192382,
"step": 1735
},
{
"epoch": 0.3165938864628821,
"grad_norm": 0.14572037756443024,
"learning_rate": 4.000844533974878e-05,
"loss": 0.16566959619522095,
"step": 1740
},
{
"epoch": 0.31750363901018924,
"grad_norm": 0.15915483236312866,
"learning_rate": 3.9949512567784684e-05,
"loss": 0.16153957843780517,
"step": 1745
},
{
"epoch": 0.3184133915574964,
"grad_norm": 0.1668540984392166,
"learning_rate": 3.9890450198021704e-05,
"loss": 0.1659809947013855,
"step": 1750
},
{
"epoch": 0.3193231441048035,
"grad_norm": 0.16612035036087036,
"learning_rate": 3.983125874247341e-05,
"loss": 0.16941241025924683,
"step": 1755
},
{
"epoch": 0.32023289665211063,
"grad_norm": 0.15163679420948029,
"learning_rate": 3.9771938714272407e-05,
"loss": 0.16053590774536133,
"step": 1760
},
{
"epoch": 0.32114264919941776,
"grad_norm": 0.1797824203968048,
"learning_rate": 3.97124906276659e-05,
"loss": 0.1667110800743103,
"step": 1765
},
{
"epoch": 0.3220524017467249,
"grad_norm": 0.15076608955860138,
"learning_rate": 3.9652914998011237e-05,
"loss": 0.1607860803604126,
"step": 1770
},
{
"epoch": 0.322962154294032,
"grad_norm": 0.16523587703704834,
"learning_rate": 3.959321234177144e-05,
"loss": 0.16515827178955078,
"step": 1775
},
{
"epoch": 0.32387190684133915,
"grad_norm": 0.22065149247646332,
"learning_rate": 3.9533383176510746e-05,
"loss": 0.1618957757949829,
"step": 1780
},
{
"epoch": 0.3247816593886463,
"grad_norm": 0.16426463425159454,
"learning_rate": 3.9473428020890066e-05,
"loss": 0.15763382911682128,
"step": 1785
},
{
"epoch": 0.3256914119359534,
"grad_norm": 0.16474904119968414,
"learning_rate": 3.941334739466257e-05,
"loss": 0.15135571956634522,
"step": 1790
},
{
"epoch": 0.32660116448326054,
"grad_norm": 0.16746412217617035,
"learning_rate": 3.935314181866909e-05,
"loss": 0.15925389528274536,
"step": 1795
},
{
"epoch": 0.32751091703056767,
"grad_norm": 0.17819371819496155,
"learning_rate": 3.929281181483369e-05,
"loss": 0.1598669171333313,
"step": 1800
},
{
"epoch": 0.3284206695778748,
"grad_norm": 0.1816040277481079,
"learning_rate": 3.923235790615907e-05,
"loss": 0.1652522087097168,
"step": 1805
},
{
"epoch": 0.32933042212518193,
"grad_norm": 0.14846695959568024,
"learning_rate": 3.917178061672211e-05,
"loss": 0.16665585041046144,
"step": 1810
},
{
"epoch": 0.33024017467248906,
"grad_norm": 0.1734926551580429,
"learning_rate": 3.911108047166924e-05,
"loss": 0.16069791316986085,
"step": 1815
},
{
"epoch": 0.3311499272197962,
"grad_norm": 0.16154922544956207,
"learning_rate": 3.905025799721194e-05,
"loss": 0.16114097833633423,
"step": 1820
},
{
"epoch": 0.3320596797671033,
"grad_norm": 0.1538771390914917,
"learning_rate": 3.898931372062217e-05,
"loss": 0.1602831244468689,
"step": 1825
},
{
"epoch": 0.3329694323144105,
"grad_norm": 0.14036566019058228,
"learning_rate": 3.892824817022781e-05,
"loss": 0.1502395749092102,
"step": 1830
},
{
"epoch": 0.33387918486171764,
"grad_norm": 0.19212059676647186,
"learning_rate": 3.886706187540804e-05,
"loss": 0.16265250444412233,
"step": 1835
},
{
"epoch": 0.33478893740902477,
"grad_norm": 0.17410333454608917,
"learning_rate": 3.880575536658881e-05,
"loss": 0.15689224004745483,
"step": 1840
},
{
"epoch": 0.3356986899563319,
"grad_norm": 0.15165294706821442,
"learning_rate": 3.874432917523817e-05,
"loss": 0.15033140182495117,
"step": 1845
},
{
"epoch": 0.336608442503639,
"grad_norm": 0.16166730225086212,
"learning_rate": 3.8682783833861736e-05,
"loss": 0.16896235942840576,
"step": 1850
},
{
"epoch": 0.33751819505094616,
"grad_norm": 0.16497021913528442,
"learning_rate": 3.8621119875998026e-05,
"loss": 0.1600774645805359,
"step": 1855
},
{
"epoch": 0.3384279475982533,
"grad_norm": 0.17264948785305023,
"learning_rate": 3.855933783621384e-05,
"loss": 0.16947593688964843,
"step": 1860
},
{
"epoch": 0.3393377001455604,
"grad_norm": 0.16870704293251038,
"learning_rate": 3.8497438250099636e-05,
"loss": 0.16062095165252685,
"step": 1865
},
{
"epoch": 0.34024745269286755,
"grad_norm": 0.16644036769866943,
"learning_rate": 3.843542165426492e-05,
"loss": 0.16015599966049193,
"step": 1870
},
{
"epoch": 0.3411572052401747,
"grad_norm": 0.1626352220773697,
"learning_rate": 3.837328858633349e-05,
"loss": 0.17444703578948975,
"step": 1875
},
{
"epoch": 0.3420669577874818,
"grad_norm": 0.1427375227212906,
"learning_rate": 3.83110395849389e-05,
"loss": 0.1589805006980896,
"step": 1880
},
{
"epoch": 0.34297671033478894,
"grad_norm": 0.17840255796909332,
"learning_rate": 3.824867518971973e-05,
"loss": 0.15953952074050903,
"step": 1885
},
{
"epoch": 0.34388646288209607,
"grad_norm": 0.16998249292373657,
"learning_rate": 3.818619594131489e-05,
"loss": 0.16027032136917113,
"step": 1890
},
{
"epoch": 0.3447962154294032,
"grad_norm": 0.14950257539749146,
"learning_rate": 3.812360238135897e-05,
"loss": 0.15335670709609986,
"step": 1895
},
{
"epoch": 0.3457059679767103,
"grad_norm": 0.1678011417388916,
"learning_rate": 3.806089505247752e-05,
"loss": 0.1560648798942566,
"step": 1900
},
{
"epoch": 0.34661572052401746,
"grad_norm": 0.17944541573524475,
"learning_rate": 3.799807449828238e-05,
"loss": 0.16072254180908202,
"step": 1905
},
{
"epoch": 0.3475254730713246,
"grad_norm": 0.166817307472229,
"learning_rate": 3.793514126336691e-05,
"loss": 0.1542820692062378,
"step": 1910
},
{
"epoch": 0.3484352256186317,
"grad_norm": 0.16047626733779907,
"learning_rate": 3.787209589330134e-05,
"loss": 0.16092092990875245,
"step": 1915
},
{
"epoch": 0.34934497816593885,
"grad_norm": 0.16478900611400604,
"learning_rate": 3.7808938934627965e-05,
"loss": 0.16765867471694945,
"step": 1920
},
{
"epoch": 0.350254730713246,
"grad_norm": 0.15349514782428741,
"learning_rate": 3.774567093485648e-05,
"loss": 0.15890377759933472,
"step": 1925
},
{
"epoch": 0.3511644832605531,
"grad_norm": 0.1515921950340271,
"learning_rate": 3.768229244245917e-05,
"loss": 0.16668319702148438,
"step": 1930
},
{
"epoch": 0.35207423580786024,
"grad_norm": 0.16310466825962067,
"learning_rate": 3.7618804006866195e-05,
"loss": 0.15182652473449706,
"step": 1935
},
{
"epoch": 0.3529839883551674,
"grad_norm": 0.17294517159461975,
"learning_rate": 3.755520617846084e-05,
"loss": 0.16287628412246705,
"step": 1940
},
{
"epoch": 0.35389374090247455,
"grad_norm": 0.1482895463705063,
"learning_rate": 3.749149950857467e-05,
"loss": 0.15321952104568481,
"step": 1945
},
{
"epoch": 0.3548034934497817,
"grad_norm": 0.2236029952764511,
"learning_rate": 3.7427684549482847e-05,
"loss": 0.15403482913970948,
"step": 1950
},
{
"epoch": 0.3557132459970888,
"grad_norm": 0.20185327529907227,
"learning_rate": 3.736376185439927e-05,
"loss": 0.1633884072303772,
"step": 1955
},
{
"epoch": 0.35662299854439594,
"grad_norm": 0.13906247913837433,
"learning_rate": 3.7299731977471816e-05,
"loss": 0.15925350189208984,
"step": 1960
},
{
"epoch": 0.35753275109170307,
"grad_norm": 0.18665002286434174,
"learning_rate": 3.723559547377751e-05,
"loss": 0.1612026572227478,
"step": 1965
},
{
"epoch": 0.3584425036390102,
"grad_norm": 0.16913433372974396,
"learning_rate": 3.717135289931774e-05,
"loss": 0.15479494333267213,
"step": 1970
},
{
"epoch": 0.35935225618631733,
"grad_norm": 0.1620066910982132,
"learning_rate": 3.7107004811013434e-05,
"loss": 0.1604058027267456,
"step": 1975
},
{
"epoch": 0.36026200873362446,
"grad_norm": 0.16838301718235016,
"learning_rate": 3.704255176670021e-05,
"loss": 0.15335073471069335,
"step": 1980
},
{
"epoch": 0.3611717612809316,
"grad_norm": 0.3054695427417755,
"learning_rate": 3.6977994325123535e-05,
"loss": 0.16558053493499755,
"step": 1985
},
{
"epoch": 0.3620815138282387,
"grad_norm": 0.1526716649532318,
"learning_rate": 3.6913333045933934e-05,
"loss": 0.16148923635482787,
"step": 1990
},
{
"epoch": 0.36299126637554585,
"grad_norm": 0.15328513085842133,
"learning_rate": 3.684856848968209e-05,
"loss": 0.1553613781929016,
"step": 1995
},
{
"epoch": 0.363901018922853,
"grad_norm": 0.16129714250564575,
"learning_rate": 3.6783701217813995e-05,
"loss": 0.16724612712860107,
"step": 2000
}
],
"logging_steps": 5,
"max_steps": 5500,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.1054573765554867e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}