GreekTTS-1.5 / checkpoint-264000 /trainer_state.json
moiraai2024's picture
Uploading final model weights
95a8061 verified
raw
history blame contribute delete
58.8 kB
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.8540145985401457,
"eval_steps": 500,
"global_step": 264000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01167883211678832,
"grad_norm": 0.7469853758811951,
"learning_rate": 0.00019976817349031748,
"loss": 4.9807,
"step": 800
},
{
"epoch": 0.02335766423357664,
"grad_norm": 1.120423436164856,
"learning_rate": 0.00019953459514303025,
"loss": 4.6712,
"step": 1600
},
{
"epoch": 0.035036496350364967,
"grad_norm": 1.064866542816162,
"learning_rate": 0.00019930101679574305,
"loss": 4.5512,
"step": 2400
},
{
"epoch": 0.04671532846715328,
"grad_norm": 0.8409781455993652,
"learning_rate": 0.00019906743844845583,
"loss": 4.4947,
"step": 3200
},
{
"epoch": 0.058394160583941604,
"grad_norm": 1.03812575340271,
"learning_rate": 0.00019883386010116863,
"loss": 4.4487,
"step": 4000
},
{
"epoch": 0.07007299270072993,
"grad_norm": 0.6089211106300354,
"learning_rate": 0.00019860028175388143,
"loss": 4.4197,
"step": 4800
},
{
"epoch": 0.08175182481751825,
"grad_norm": 1.2314237356185913,
"learning_rate": 0.00019836670340659423,
"loss": 4.3931,
"step": 5600
},
{
"epoch": 0.09343065693430656,
"grad_norm": 0.6221948862075806,
"learning_rate": 0.000198133125059307,
"loss": 4.3595,
"step": 6400
},
{
"epoch": 0.10510948905109489,
"grad_norm": 0.7354845404624939,
"learning_rate": 0.0001978995467120198,
"loss": 4.3489,
"step": 7200
},
{
"epoch": 0.11678832116788321,
"grad_norm": 0.47563284635543823,
"learning_rate": 0.00019766596836473258,
"loss": 4.3167,
"step": 8000
},
{
"epoch": 0.12846715328467154,
"grad_norm": 0.5970898270606995,
"learning_rate": 0.0001974323900174454,
"loss": 4.3059,
"step": 8800
},
{
"epoch": 0.14014598540145987,
"grad_norm": 1.0531948804855347,
"learning_rate": 0.0001971988116701582,
"loss": 4.294,
"step": 9600
},
{
"epoch": 0.15182481751824817,
"grad_norm": 0.6124999523162842,
"learning_rate": 0.000196965233322871,
"loss": 4.288,
"step": 10400
},
{
"epoch": 0.1635036496350365,
"grad_norm": 0.43388164043426514,
"learning_rate": 0.00019673165497558376,
"loss": 4.273,
"step": 11200
},
{
"epoch": 0.17518248175182483,
"grad_norm": 1.2883777618408203,
"learning_rate": 0.00019649807662829656,
"loss": 4.2696,
"step": 12000
},
{
"epoch": 0.18686131386861313,
"grad_norm": 1.0502655506134033,
"learning_rate": 0.00019626449828100937,
"loss": 4.2584,
"step": 12800
},
{
"epoch": 0.19854014598540146,
"grad_norm": 1.0939903259277344,
"learning_rate": 0.00019603091993372217,
"loss": 4.2377,
"step": 13600
},
{
"epoch": 0.21021897810218979,
"grad_norm": 0.732036828994751,
"learning_rate": 0.00019579734158643494,
"loss": 4.2274,
"step": 14400
},
{
"epoch": 0.22189781021897811,
"grad_norm": 0.7493735551834106,
"learning_rate": 0.00019556376323914774,
"loss": 4.2382,
"step": 15200
},
{
"epoch": 0.23357664233576642,
"grad_norm": 0.7022084593772888,
"learning_rate": 0.00019533018489186052,
"loss": 4.241,
"step": 16000
},
{
"epoch": 0.24525547445255474,
"grad_norm": 0.6069856882095337,
"learning_rate": 0.00019509660654457335,
"loss": 4.2072,
"step": 16800
},
{
"epoch": 0.2569343065693431,
"grad_norm": 0.9912208318710327,
"learning_rate": 0.00019486302819728612,
"loss": 4.2133,
"step": 17600
},
{
"epoch": 0.2686131386861314,
"grad_norm": 0.6228162050247192,
"learning_rate": 0.00019462944984999892,
"loss": 4.2074,
"step": 18400
},
{
"epoch": 0.28029197080291973,
"grad_norm": 0.6577867269515991,
"learning_rate": 0.0001943958715027117,
"loss": 4.1989,
"step": 19200
},
{
"epoch": 0.291970802919708,
"grad_norm": 0.5231236815452576,
"learning_rate": 0.0001941622931554245,
"loss": 4.1921,
"step": 20000
},
{
"epoch": 0.30364963503649633,
"grad_norm": 0.7894455194473267,
"learning_rate": 0.0001939287148081373,
"loss": 4.1899,
"step": 20800
},
{
"epoch": 0.31532846715328466,
"grad_norm": 0.7088421583175659,
"learning_rate": 0.0001936951364608501,
"loss": 4.1795,
"step": 21600
},
{
"epoch": 0.327007299270073,
"grad_norm": 0.7622804045677185,
"learning_rate": 0.00019346155811356288,
"loss": 4.1789,
"step": 22400
},
{
"epoch": 0.3386861313868613,
"grad_norm": 0.41267886757850647,
"learning_rate": 0.00019322797976627568,
"loss": 4.1695,
"step": 23200
},
{
"epoch": 0.35036496350364965,
"grad_norm": 0.6643211245536804,
"learning_rate": 0.00019299440141898848,
"loss": 4.1714,
"step": 24000
},
{
"epoch": 0.362043795620438,
"grad_norm": 1.1012191772460938,
"learning_rate": 0.00019276082307170128,
"loss": 4.1677,
"step": 24800
},
{
"epoch": 0.37372262773722625,
"grad_norm": 0.5746153593063354,
"learning_rate": 0.00019252724472441405,
"loss": 4.1463,
"step": 25600
},
{
"epoch": 0.3854014598540146,
"grad_norm": 0.6068096160888672,
"learning_rate": 0.00019229366637712686,
"loss": 4.1643,
"step": 26400
},
{
"epoch": 0.3970802919708029,
"grad_norm": 0.4222017228603363,
"learning_rate": 0.00019206008802983963,
"loss": 4.1627,
"step": 27200
},
{
"epoch": 0.40875912408759124,
"grad_norm": 1.108417272567749,
"learning_rate": 0.00019182650968255246,
"loss": 4.1555,
"step": 28000
},
{
"epoch": 0.42043795620437957,
"grad_norm": 0.847973644733429,
"learning_rate": 0.00019159293133526523,
"loss": 4.1604,
"step": 28800
},
{
"epoch": 0.4321167883211679,
"grad_norm": 0.7368318438529968,
"learning_rate": 0.00019135935298797804,
"loss": 4.1497,
"step": 29600
},
{
"epoch": 0.44379562043795623,
"grad_norm": 1.041403889656067,
"learning_rate": 0.0001911257746406908,
"loss": 4.1417,
"step": 30400
},
{
"epoch": 0.4554744525547445,
"grad_norm": 0.6244940757751465,
"learning_rate": 0.0001908921962934036,
"loss": 4.1642,
"step": 31200
},
{
"epoch": 0.46715328467153283,
"grad_norm": 0.871951162815094,
"learning_rate": 0.0001906586179461164,
"loss": 4.1358,
"step": 32000
},
{
"epoch": 0.47883211678832116,
"grad_norm": 1.751654028892517,
"learning_rate": 0.00019042503959882921,
"loss": 4.1393,
"step": 32800
},
{
"epoch": 0.4905109489051095,
"grad_norm": 0.8029406070709229,
"learning_rate": 0.000190191461251542,
"loss": 4.132,
"step": 33600
},
{
"epoch": 0.5021897810218978,
"grad_norm": 1.084448218345642,
"learning_rate": 0.0001899578829042548,
"loss": 4.1323,
"step": 34400
},
{
"epoch": 0.5138686131386861,
"grad_norm": 0.8661244511604309,
"learning_rate": 0.00018972430455696756,
"loss": 4.1388,
"step": 35200
},
{
"epoch": 0.5255474452554745,
"grad_norm": 0.7281563878059387,
"learning_rate": 0.0001894907262096804,
"loss": 4.1354,
"step": 36000
},
{
"epoch": 0.5372262773722628,
"grad_norm": 0.9080061912536621,
"learning_rate": 0.00018925714786239317,
"loss": 4.1294,
"step": 36800
},
{
"epoch": 0.5489051094890511,
"grad_norm": 0.7405328750610352,
"learning_rate": 0.00018902356951510597,
"loss": 4.1279,
"step": 37600
},
{
"epoch": 0.5605839416058395,
"grad_norm": 0.5771759748458862,
"learning_rate": 0.00018878999116781874,
"loss": 4.1094,
"step": 38400
},
{
"epoch": 0.5722627737226277,
"grad_norm": 0.7671827077865601,
"learning_rate": 0.00018855641282053155,
"loss": 4.1127,
"step": 39200
},
{
"epoch": 0.583941605839416,
"grad_norm": 0.7648681998252869,
"learning_rate": 0.00018832283447324435,
"loss": 4.1126,
"step": 40000
},
{
"epoch": 0.5956204379562043,
"grad_norm": 0.8951389789581299,
"learning_rate": 0.00018808925612595715,
"loss": 4.1117,
"step": 40800
},
{
"epoch": 0.6072992700729927,
"grad_norm": 0.6852630972862244,
"learning_rate": 0.00018785567777866992,
"loss": 4.1218,
"step": 41600
},
{
"epoch": 0.618978102189781,
"grad_norm": 0.8639559745788574,
"learning_rate": 0.00018762209943138272,
"loss": 4.1041,
"step": 42400
},
{
"epoch": 0.6306569343065693,
"grad_norm": 0.7715902924537659,
"learning_rate": 0.0001873885210840955,
"loss": 4.1104,
"step": 43200
},
{
"epoch": 0.6423357664233577,
"grad_norm": 0.9155542254447937,
"learning_rate": 0.00018715494273680833,
"loss": 4.1059,
"step": 44000
},
{
"epoch": 0.654014598540146,
"grad_norm": 0.780484676361084,
"learning_rate": 0.0001869213643895211,
"loss": 4.0958,
"step": 44800
},
{
"epoch": 0.6656934306569343,
"grad_norm": 0.6784650087356567,
"learning_rate": 0.0001866877860422339,
"loss": 4.0841,
"step": 45600
},
{
"epoch": 0.6773722627737226,
"grad_norm": 1.1621425151824951,
"learning_rate": 0.00018645420769494668,
"loss": 4.1108,
"step": 46400
},
{
"epoch": 0.689051094890511,
"grad_norm": 0.47390952706336975,
"learning_rate": 0.00018622062934765948,
"loss": 4.0979,
"step": 47200
},
{
"epoch": 0.7007299270072993,
"grad_norm": 0.690737247467041,
"learning_rate": 0.00018598705100037228,
"loss": 4.0978,
"step": 48000
},
{
"epoch": 0.7124087591240876,
"grad_norm": 1.1309623718261719,
"learning_rate": 0.00018575347265308508,
"loss": 4.097,
"step": 48800
},
{
"epoch": 0.724087591240876,
"grad_norm": 0.6210489869117737,
"learning_rate": 0.00018551989430579786,
"loss": 4.0909,
"step": 49600
},
{
"epoch": 0.7357664233576642,
"grad_norm": 0.673042356967926,
"learning_rate": 0.00018528631595851066,
"loss": 4.0867,
"step": 50400
},
{
"epoch": 0.7474452554744525,
"grad_norm": 0.5582263469696045,
"learning_rate": 0.00018505273761122346,
"loss": 4.0917,
"step": 51200
},
{
"epoch": 0.7591240875912408,
"grad_norm": 0.6824519634246826,
"learning_rate": 0.00018481915926393626,
"loss": 4.0833,
"step": 52000
},
{
"epoch": 0.7708029197080292,
"grad_norm": 0.7440599799156189,
"learning_rate": 0.00018458558091664904,
"loss": 4.0798,
"step": 52800
},
{
"epoch": 0.7824817518248175,
"grad_norm": 0.7103509902954102,
"learning_rate": 0.00018435200256936184,
"loss": 4.0903,
"step": 53600
},
{
"epoch": 0.7941605839416058,
"grad_norm": 0.9494150280952454,
"learning_rate": 0.0001841184242220746,
"loss": 4.0924,
"step": 54400
},
{
"epoch": 0.8058394160583942,
"grad_norm": 0.7308784127235413,
"learning_rate": 0.0001838848458747874,
"loss": 4.0965,
"step": 55200
},
{
"epoch": 0.8175182481751825,
"grad_norm": 0.7546706199645996,
"learning_rate": 0.00018365126752750021,
"loss": 4.0852,
"step": 56000
},
{
"epoch": 0.8291970802919708,
"grad_norm": 0.6851491928100586,
"learning_rate": 0.00018341768918021302,
"loss": 4.0805,
"step": 56800
},
{
"epoch": 0.8408759124087591,
"grad_norm": 0.6497614979743958,
"learning_rate": 0.0001831841108329258,
"loss": 4.0908,
"step": 57600
},
{
"epoch": 0.8525547445255475,
"grad_norm": 0.8901756405830383,
"learning_rate": 0.0001829505324856386,
"loss": 4.0665,
"step": 58400
},
{
"epoch": 0.8642335766423358,
"grad_norm": 0.7579403519630432,
"learning_rate": 0.0001827169541383514,
"loss": 4.0739,
"step": 59200
},
{
"epoch": 0.8759124087591241,
"grad_norm": 0.7560231685638428,
"learning_rate": 0.0001824833757910642,
"loss": 4.0696,
"step": 60000
},
{
"epoch": 0.8875912408759125,
"grad_norm": 0.9320287108421326,
"learning_rate": 0.00018224979744377697,
"loss": 4.0674,
"step": 60800
},
{
"epoch": 0.8992700729927007,
"grad_norm": 0.6470750570297241,
"learning_rate": 0.00018201621909648977,
"loss": 4.0664,
"step": 61600
},
{
"epoch": 0.910948905109489,
"grad_norm": 0.7757769227027893,
"learning_rate": 0.00018178264074920255,
"loss": 4.0676,
"step": 62400
},
{
"epoch": 0.9226277372262773,
"grad_norm": 0.9148152470588684,
"learning_rate": 0.00018154906240191535,
"loss": 4.0629,
"step": 63200
},
{
"epoch": 0.9343065693430657,
"grad_norm": 0.7893709540367126,
"learning_rate": 0.00018131548405462815,
"loss": 4.0714,
"step": 64000
},
{
"epoch": 0.945985401459854,
"grad_norm": 0.7790332436561584,
"learning_rate": 0.00018108190570734095,
"loss": 4.0618,
"step": 64800
},
{
"epoch": 0.9576642335766423,
"grad_norm": 1.025794267654419,
"learning_rate": 0.00018084832736005372,
"loss": 4.0622,
"step": 65600
},
{
"epoch": 0.9693430656934306,
"grad_norm": 0.9819601774215698,
"learning_rate": 0.00018061474901276653,
"loss": 4.056,
"step": 66400
},
{
"epoch": 0.981021897810219,
"grad_norm": 0.7966727018356323,
"learning_rate": 0.00018038117066547933,
"loss": 4.0645,
"step": 67200
},
{
"epoch": 0.9927007299270073,
"grad_norm": 0.5256748199462891,
"learning_rate": 0.00018014759231819213,
"loss": 4.0625,
"step": 68000
},
{
"epoch": 1.0043795620437956,
"grad_norm": 1.0585341453552246,
"learning_rate": 0.0001799140139709049,
"loss": 4.0299,
"step": 68800
},
{
"epoch": 1.0160583941605839,
"grad_norm": 0.568466067314148,
"learning_rate": 0.0001796804356236177,
"loss": 3.982,
"step": 69600
},
{
"epoch": 1.0277372262773723,
"grad_norm": 0.651147186756134,
"learning_rate": 0.00017944685727633048,
"loss": 3.9653,
"step": 70400
},
{
"epoch": 1.0394160583941605,
"grad_norm": 0.8875618577003479,
"learning_rate": 0.00017921327892904328,
"loss": 3.9868,
"step": 71200
},
{
"epoch": 1.051094890510949,
"grad_norm": 0.9252369999885559,
"learning_rate": 0.00017897970058175608,
"loss": 3.9729,
"step": 72000
},
{
"epoch": 1.0627737226277372,
"grad_norm": 0.5309298634529114,
"learning_rate": 0.00017874612223446888,
"loss": 3.9855,
"step": 72800
},
{
"epoch": 1.0744525547445256,
"grad_norm": 0.7743874192237854,
"learning_rate": 0.00017851254388718166,
"loss": 3.9786,
"step": 73600
},
{
"epoch": 1.0861313868613138,
"grad_norm": 0.6744789481163025,
"learning_rate": 0.00017827896553989446,
"loss": 3.9776,
"step": 74400
},
{
"epoch": 1.0978102189781023,
"grad_norm": 0.8282249569892883,
"learning_rate": 0.00017804538719260726,
"loss": 3.9781,
"step": 75200
},
{
"epoch": 1.1094890510948905,
"grad_norm": 1.0976659059524536,
"learning_rate": 0.00017781180884532004,
"loss": 3.9795,
"step": 76000
},
{
"epoch": 1.121167883211679,
"grad_norm": 0.560089647769928,
"learning_rate": 0.00017757823049803284,
"loss": 3.9812,
"step": 76800
},
{
"epoch": 1.1328467153284671,
"grad_norm": 0.9681680798530579,
"learning_rate": 0.00017734465215074564,
"loss": 3.9711,
"step": 77600
},
{
"epoch": 1.1445255474452556,
"grad_norm": 0.5735695958137512,
"learning_rate": 0.00017711107380345844,
"loss": 3.9784,
"step": 78400
},
{
"epoch": 1.1562043795620438,
"grad_norm": 0.49498119950294495,
"learning_rate": 0.00017687749545617121,
"loss": 3.9843,
"step": 79200
},
{
"epoch": 1.167883211678832,
"grad_norm": 1.0702383518218994,
"learning_rate": 0.00017664391710888402,
"loss": 3.9929,
"step": 80000
},
{
"epoch": 1.1795620437956205,
"grad_norm": 1.3075828552246094,
"learning_rate": 0.00017641033876159682,
"loss": 3.9816,
"step": 80800
},
{
"epoch": 1.1912408759124087,
"grad_norm": 1.111781120300293,
"learning_rate": 0.0001761767604143096,
"loss": 3.9792,
"step": 81600
},
{
"epoch": 1.2029197080291971,
"grad_norm": 0.9853603839874268,
"learning_rate": 0.0001759431820670224,
"loss": 3.9846,
"step": 82400
},
{
"epoch": 1.2145985401459853,
"grad_norm": 0.6544378399848938,
"learning_rate": 0.0001757096037197352,
"loss": 3.9781,
"step": 83200
},
{
"epoch": 1.2262773722627738,
"grad_norm": 1.1322039365768433,
"learning_rate": 0.00017547602537244797,
"loss": 3.9898,
"step": 84000
},
{
"epoch": 1.237956204379562,
"grad_norm": 0.7854397892951965,
"learning_rate": 0.00017524244702516077,
"loss": 3.9751,
"step": 84800
},
{
"epoch": 1.2496350364963504,
"grad_norm": 1.025718092918396,
"learning_rate": 0.00017500886867787357,
"loss": 3.9901,
"step": 85600
},
{
"epoch": 1.2613138686131387,
"grad_norm": 0.7675819993019104,
"learning_rate": 0.00017477529033058637,
"loss": 3.984,
"step": 86400
},
{
"epoch": 1.2729927007299269,
"grad_norm": 0.8988509774208069,
"learning_rate": 0.00017454171198329915,
"loss": 3.9918,
"step": 87200
},
{
"epoch": 1.2846715328467153,
"grad_norm": 1.1059536933898926,
"learning_rate": 0.00017430813363601195,
"loss": 3.9813,
"step": 88000
},
{
"epoch": 1.2963503649635038,
"grad_norm": 0.5204365253448486,
"learning_rate": 0.00017407455528872475,
"loss": 3.9744,
"step": 88800
},
{
"epoch": 1.308029197080292,
"grad_norm": 0.9890186786651611,
"learning_rate": 0.00017384097694143753,
"loss": 3.9834,
"step": 89600
},
{
"epoch": 1.3197080291970802,
"grad_norm": 1.2235816717147827,
"learning_rate": 0.00017360739859415033,
"loss": 3.993,
"step": 90400
},
{
"epoch": 1.3313868613138686,
"grad_norm": 1.0497245788574219,
"learning_rate": 0.00017337382024686313,
"loss": 3.9901,
"step": 91200
},
{
"epoch": 1.343065693430657,
"grad_norm": 0.926929235458374,
"learning_rate": 0.0001731402418995759,
"loss": 3.9742,
"step": 92000
},
{
"epoch": 1.3547445255474453,
"grad_norm": 0.7672074437141418,
"learning_rate": 0.0001729066635522887,
"loss": 3.9956,
"step": 92800
},
{
"epoch": 1.3664233576642335,
"grad_norm": 0.5817465782165527,
"learning_rate": 0.0001726730852050015,
"loss": 3.9828,
"step": 93600
},
{
"epoch": 1.378102189781022,
"grad_norm": 0.7459368109703064,
"learning_rate": 0.0001724395068577143,
"loss": 3.983,
"step": 94400
},
{
"epoch": 1.3897810218978102,
"grad_norm": 0.9725570678710938,
"learning_rate": 0.00017220592851042708,
"loss": 3.9791,
"step": 95200
},
{
"epoch": 1.4014598540145986,
"grad_norm": 1.300221562385559,
"learning_rate": 0.00017197235016313988,
"loss": 3.986,
"step": 96000
},
{
"epoch": 1.4131386861313868,
"grad_norm": 0.6552464962005615,
"learning_rate": 0.00017173877181585269,
"loss": 3.9816,
"step": 96800
},
{
"epoch": 1.4248175182481753,
"grad_norm": 1.0207213163375854,
"learning_rate": 0.00017150519346856546,
"loss": 3.9733,
"step": 97600
},
{
"epoch": 1.4364963503649635,
"grad_norm": 0.9970253109931946,
"learning_rate": 0.00017127161512127826,
"loss": 3.9834,
"step": 98400
},
{
"epoch": 1.448175182481752,
"grad_norm": 0.908315896987915,
"learning_rate": 0.00017103803677399106,
"loss": 3.9877,
"step": 99200
},
{
"epoch": 1.4598540145985401,
"grad_norm": 0.9726221561431885,
"learning_rate": 0.00017080445842670384,
"loss": 3.977,
"step": 100000
},
{
"epoch": 1.4715328467153284,
"grad_norm": 0.7048055529594421,
"learning_rate": 0.00017057088007941664,
"loss": 3.986,
"step": 100800
},
{
"epoch": 1.4832116788321168,
"grad_norm": 0.5860503911972046,
"learning_rate": 0.00017033730173212944,
"loss": 3.9723,
"step": 101600
},
{
"epoch": 1.4948905109489052,
"grad_norm": 1.0115162134170532,
"learning_rate": 0.00017010372338484224,
"loss": 3.9777,
"step": 102400
},
{
"epoch": 1.5065693430656935,
"grad_norm": 0.9691118597984314,
"learning_rate": 0.00016987014503755502,
"loss": 3.9749,
"step": 103200
},
{
"epoch": 1.5182481751824817,
"grad_norm": 0.7307090759277344,
"learning_rate": 0.00016963656669026782,
"loss": 3.98,
"step": 104000
},
{
"epoch": 1.5299270072992701,
"grad_norm": 0.9633815288543701,
"learning_rate": 0.00016940298834298062,
"loss": 3.9814,
"step": 104800
},
{
"epoch": 1.5416058394160586,
"grad_norm": 0.8716799020767212,
"learning_rate": 0.00016916940999569342,
"loss": 3.9799,
"step": 105600
},
{
"epoch": 1.5532846715328468,
"grad_norm": 1.063167691230774,
"learning_rate": 0.0001689358316484062,
"loss": 3.9828,
"step": 106400
},
{
"epoch": 1.564963503649635,
"grad_norm": 0.9315568804740906,
"learning_rate": 0.000168702253301119,
"loss": 3.9801,
"step": 107200
},
{
"epoch": 1.5766423357664232,
"grad_norm": 1.408599853515625,
"learning_rate": 0.00016846867495383177,
"loss": 3.9741,
"step": 108000
},
{
"epoch": 1.5883211678832116,
"grad_norm": 0.958906352519989,
"learning_rate": 0.00016823509660654457,
"loss": 3.9745,
"step": 108800
},
{
"epoch": 1.6,
"grad_norm": 1.124635934829712,
"learning_rate": 0.00016800151825925737,
"loss": 3.9809,
"step": 109600
},
{
"epoch": 1.6116788321167883,
"grad_norm": 0.8791921138763428,
"learning_rate": 0.00016776793991197018,
"loss": 3.9703,
"step": 110400
},
{
"epoch": 1.6233576642335765,
"grad_norm": 0.6150586009025574,
"learning_rate": 0.00016753436156468295,
"loss": 3.9944,
"step": 111200
},
{
"epoch": 1.635036496350365,
"grad_norm": 0.9286842942237854,
"learning_rate": 0.00016730078321739575,
"loss": 3.972,
"step": 112000
},
{
"epoch": 1.6467153284671534,
"grad_norm": 1.3587080240249634,
"learning_rate": 0.00016706720487010855,
"loss": 3.9649,
"step": 112800
},
{
"epoch": 1.6583941605839416,
"grad_norm": 1.1084531545639038,
"learning_rate": 0.00016683362652282136,
"loss": 3.9833,
"step": 113600
},
{
"epoch": 1.6700729927007298,
"grad_norm": 0.8050103783607483,
"learning_rate": 0.00016660004817553413,
"loss": 3.9743,
"step": 114400
},
{
"epoch": 1.6817518248175183,
"grad_norm": 0.8709130883216858,
"learning_rate": 0.00016636646982824693,
"loss": 3.9626,
"step": 115200
},
{
"epoch": 1.6934306569343067,
"grad_norm": 0.6005585789680481,
"learning_rate": 0.0001661328914809597,
"loss": 3.9638,
"step": 116000
},
{
"epoch": 1.705109489051095,
"grad_norm": 1.2463181018829346,
"learning_rate": 0.0001658993131336725,
"loss": 3.9788,
"step": 116800
},
{
"epoch": 1.7167883211678832,
"grad_norm": 1.1946378946304321,
"learning_rate": 0.0001656657347863853,
"loss": 3.9697,
"step": 117600
},
{
"epoch": 1.7284671532846714,
"grad_norm": 0.7879184484481812,
"learning_rate": 0.0001654321564390981,
"loss": 3.9621,
"step": 118400
},
{
"epoch": 1.7401459854014598,
"grad_norm": 1.1674267053604126,
"learning_rate": 0.00016519857809181088,
"loss": 3.9646,
"step": 119200
},
{
"epoch": 1.7518248175182483,
"grad_norm": 0.980387270450592,
"learning_rate": 0.00016496499974452369,
"loss": 3.972,
"step": 120000
},
{
"epoch": 1.7635036496350365,
"grad_norm": 0.9502540826797485,
"learning_rate": 0.0001647314213972365,
"loss": 3.9702,
"step": 120800
},
{
"epoch": 1.7751824817518247,
"grad_norm": 0.6683688759803772,
"learning_rate": 0.0001644978430499493,
"loss": 3.9704,
"step": 121600
},
{
"epoch": 1.7868613138686131,
"grad_norm": 1.1526950597763062,
"learning_rate": 0.00016426426470266206,
"loss": 3.9644,
"step": 122400
},
{
"epoch": 1.7985401459854016,
"grad_norm": 0.8221763968467712,
"learning_rate": 0.00016403068635537486,
"loss": 3.9677,
"step": 123200
},
{
"epoch": 1.8102189781021898,
"grad_norm": 1.111382246017456,
"learning_rate": 0.00016379710800808764,
"loss": 3.9787,
"step": 124000
},
{
"epoch": 1.821897810218978,
"grad_norm": 1.3993886709213257,
"learning_rate": 0.00016356352966080044,
"loss": 3.9848,
"step": 124800
},
{
"epoch": 1.8335766423357664,
"grad_norm": 0.9269404411315918,
"learning_rate": 0.00016332995131351324,
"loss": 3.9477,
"step": 125600
},
{
"epoch": 1.845255474452555,
"grad_norm": 0.9304395318031311,
"learning_rate": 0.00016309637296622604,
"loss": 3.9742,
"step": 126400
},
{
"epoch": 1.856934306569343,
"grad_norm": 1.1939619779586792,
"learning_rate": 0.00016286279461893882,
"loss": 3.9732,
"step": 127200
},
{
"epoch": 1.8686131386861313,
"grad_norm": 0.963022768497467,
"learning_rate": 0.00016262921627165162,
"loss": 3.9625,
"step": 128000
},
{
"epoch": 1.8802919708029195,
"grad_norm": 0.8013544082641602,
"learning_rate": 0.00016239563792436442,
"loss": 3.9648,
"step": 128800
},
{
"epoch": 1.891970802919708,
"grad_norm": 1.1415009498596191,
"learning_rate": 0.00016216205957707722,
"loss": 3.9715,
"step": 129600
},
{
"epoch": 1.9036496350364964,
"grad_norm": 0.9819127917289734,
"learning_rate": 0.00016192848122979,
"loss": 3.9624,
"step": 130400
},
{
"epoch": 1.9153284671532846,
"grad_norm": 0.7112650871276855,
"learning_rate": 0.0001616949028825028,
"loss": 3.9607,
"step": 131200
},
{
"epoch": 1.9270072992700729,
"grad_norm": 0.7825914025306702,
"learning_rate": 0.00016146132453521557,
"loss": 3.9492,
"step": 132000
},
{
"epoch": 1.9386861313868613,
"grad_norm": 1.136427402496338,
"learning_rate": 0.0001612277461879284,
"loss": 3.9607,
"step": 132800
},
{
"epoch": 1.9503649635036497,
"grad_norm": 1.004979133605957,
"learning_rate": 0.00016099416784064118,
"loss": 3.9734,
"step": 133600
},
{
"epoch": 1.962043795620438,
"grad_norm": 1.0205179452896118,
"learning_rate": 0.00016076058949335398,
"loss": 3.9629,
"step": 134400
},
{
"epoch": 1.9737226277372262,
"grad_norm": 0.6552355885505676,
"learning_rate": 0.00016052701114606675,
"loss": 3.9705,
"step": 135200
},
{
"epoch": 1.9854014598540146,
"grad_norm": 0.9943566918373108,
"learning_rate": 0.00016029343279877955,
"loss": 3.9582,
"step": 136000
},
{
"epoch": 1.997080291970803,
"grad_norm": 1.0246118307113647,
"learning_rate": 0.00016005985445149236,
"loss": 3.9668,
"step": 136800
},
{
"epoch": 2.0087591240875913,
"grad_norm": 0.783827006816864,
"learning_rate": 0.00015982627610420516,
"loss": 3.881,
"step": 137600
},
{
"epoch": 2.0204379562043795,
"grad_norm": 0.7887817025184631,
"learning_rate": 0.00015959269775691793,
"loss": 3.8402,
"step": 138400
},
{
"epoch": 2.0321167883211677,
"grad_norm": 0.9230135679244995,
"learning_rate": 0.00015935911940963073,
"loss": 3.8429,
"step": 139200
},
{
"epoch": 2.0437956204379564,
"grad_norm": 0.9154564738273621,
"learning_rate": 0.0001591255410623435,
"loss": 3.8557,
"step": 140000
},
{
"epoch": 2.0554744525547446,
"grad_norm": 1.6573781967163086,
"learning_rate": 0.00015889196271505634,
"loss": 3.8501,
"step": 140800
},
{
"epoch": 2.067153284671533,
"grad_norm": 0.9853724241256714,
"learning_rate": 0.0001586583843677691,
"loss": 3.847,
"step": 141600
},
{
"epoch": 2.078832116788321,
"grad_norm": 1.1365453004837036,
"learning_rate": 0.0001584248060204819,
"loss": 3.8616,
"step": 142400
},
{
"epoch": 2.0905109489051097,
"grad_norm": 1.0716379880905151,
"learning_rate": 0.00015819122767319469,
"loss": 3.8607,
"step": 143200
},
{
"epoch": 2.102189781021898,
"grad_norm": 0.862193763256073,
"learning_rate": 0.0001579576493259075,
"loss": 3.8566,
"step": 144000
},
{
"epoch": 2.113868613138686,
"grad_norm": 0.6875022649765015,
"learning_rate": 0.0001577240709786203,
"loss": 3.8737,
"step": 144800
},
{
"epoch": 2.1255474452554743,
"grad_norm": 0.7993234992027283,
"learning_rate": 0.0001574904926313331,
"loss": 3.8699,
"step": 145600
},
{
"epoch": 2.137226277372263,
"grad_norm": 1.0347297191619873,
"learning_rate": 0.00015725691428404587,
"loss": 3.8774,
"step": 146400
},
{
"epoch": 2.1489051094890512,
"grad_norm": 0.8707027435302734,
"learning_rate": 0.00015702333593675867,
"loss": 3.8687,
"step": 147200
},
{
"epoch": 2.1605839416058394,
"grad_norm": 1.2627824544906616,
"learning_rate": 0.00015678975758947144,
"loss": 3.8672,
"step": 148000
},
{
"epoch": 2.1722627737226277,
"grad_norm": 1.1346710920333862,
"learning_rate": 0.00015655617924218427,
"loss": 3.8797,
"step": 148800
},
{
"epoch": 2.183941605839416,
"grad_norm": 1.2239959239959717,
"learning_rate": 0.00015632260089489704,
"loss": 3.8604,
"step": 149600
},
{
"epoch": 2.1956204379562045,
"grad_norm": 1.1624715328216553,
"learning_rate": 0.00015608902254760985,
"loss": 3.8678,
"step": 150400
},
{
"epoch": 2.2072992700729928,
"grad_norm": 0.9525280594825745,
"learning_rate": 0.00015585544420032262,
"loss": 3.8617,
"step": 151200
},
{
"epoch": 2.218978102189781,
"grad_norm": 0.6676674485206604,
"learning_rate": 0.00015562186585303542,
"loss": 3.8645,
"step": 152000
},
{
"epoch": 2.230656934306569,
"grad_norm": 1.4291656017303467,
"learning_rate": 0.00015538828750574822,
"loss": 3.8721,
"step": 152800
},
{
"epoch": 2.242335766423358,
"grad_norm": 2.071485996246338,
"learning_rate": 0.00015515470915846102,
"loss": 3.881,
"step": 153600
},
{
"epoch": 2.254014598540146,
"grad_norm": 1.3130428791046143,
"learning_rate": 0.0001549211308111738,
"loss": 3.8798,
"step": 154400
},
{
"epoch": 2.2656934306569343,
"grad_norm": 1.0672556161880493,
"learning_rate": 0.0001546875524638866,
"loss": 3.8767,
"step": 155200
},
{
"epoch": 2.2773722627737225,
"grad_norm": 0.8703996539115906,
"learning_rate": 0.0001544539741165994,
"loss": 3.8696,
"step": 156000
},
{
"epoch": 2.289051094890511,
"grad_norm": 1.0338706970214844,
"learning_rate": 0.0001542203957693122,
"loss": 3.8857,
"step": 156800
},
{
"epoch": 2.3007299270072994,
"grad_norm": 0.9246997833251953,
"learning_rate": 0.00015398681742202498,
"loss": 3.884,
"step": 157600
},
{
"epoch": 2.3124087591240876,
"grad_norm": 0.7899117469787598,
"learning_rate": 0.00015375323907473778,
"loss": 3.8891,
"step": 158400
},
{
"epoch": 2.324087591240876,
"grad_norm": 0.545261800289154,
"learning_rate": 0.00015351966072745055,
"loss": 3.8702,
"step": 159200
},
{
"epoch": 2.335766423357664,
"grad_norm": 0.6720581650733948,
"learning_rate": 0.00015328608238016338,
"loss": 3.8962,
"step": 160000
},
{
"epoch": 2.3474452554744527,
"grad_norm": 0.7954906821250916,
"learning_rate": 0.00015305250403287616,
"loss": 3.8752,
"step": 160800
},
{
"epoch": 2.359124087591241,
"grad_norm": 0.6123435497283936,
"learning_rate": 0.00015281892568558896,
"loss": 3.8838,
"step": 161600
},
{
"epoch": 2.370802919708029,
"grad_norm": 1.0467143058776855,
"learning_rate": 0.00015258534733830173,
"loss": 3.8798,
"step": 162400
},
{
"epoch": 2.3824817518248174,
"grad_norm": 0.9251344799995422,
"learning_rate": 0.00015235176899101453,
"loss": 3.8801,
"step": 163200
},
{
"epoch": 2.394160583941606,
"grad_norm": 1.2078486680984497,
"learning_rate": 0.00015211819064372734,
"loss": 3.8815,
"step": 164000
},
{
"epoch": 2.4058394160583942,
"grad_norm": 1.1758290529251099,
"learning_rate": 0.00015188461229644014,
"loss": 3.8854,
"step": 164800
},
{
"epoch": 2.4175182481751825,
"grad_norm": 1.2341893911361694,
"learning_rate": 0.0001516510339491529,
"loss": 3.8736,
"step": 165600
},
{
"epoch": 2.4291970802919707,
"grad_norm": 0.85239177942276,
"learning_rate": 0.00015141745560186571,
"loss": 3.8821,
"step": 166400
},
{
"epoch": 2.4408759124087593,
"grad_norm": 0.7495572566986084,
"learning_rate": 0.0001511838772545785,
"loss": 3.8852,
"step": 167200
},
{
"epoch": 2.4525547445255476,
"grad_norm": 0.9310500621795654,
"learning_rate": 0.00015095029890729132,
"loss": 3.8904,
"step": 168000
},
{
"epoch": 2.4642335766423358,
"grad_norm": 1.059832215309143,
"learning_rate": 0.0001507167205600041,
"loss": 3.8716,
"step": 168800
},
{
"epoch": 2.475912408759124,
"grad_norm": 1.6281853914260864,
"learning_rate": 0.0001504831422127169,
"loss": 3.8841,
"step": 169600
},
{
"epoch": 2.487591240875912,
"grad_norm": 0.7431134581565857,
"learning_rate": 0.00015024956386542967,
"loss": 3.8971,
"step": 170400
},
{
"epoch": 2.499270072992701,
"grad_norm": 1.9159060716629028,
"learning_rate": 0.00015001598551814247,
"loss": 3.8938,
"step": 171200
},
{
"epoch": 2.510948905109489,
"grad_norm": 0.8651421070098877,
"learning_rate": 0.00014978240717085527,
"loss": 3.8959,
"step": 172000
},
{
"epoch": 2.5226277372262773,
"grad_norm": 0.6994857788085938,
"learning_rate": 0.00014954882882356807,
"loss": 3.9002,
"step": 172800
},
{
"epoch": 2.5343065693430655,
"grad_norm": 0.9615395069122314,
"learning_rate": 0.00014931525047628085,
"loss": 3.9101,
"step": 173600
},
{
"epoch": 2.5459854014598537,
"grad_norm": 1.0740983486175537,
"learning_rate": 0.00014908167212899365,
"loss": 3.8918,
"step": 174400
},
{
"epoch": 2.5576642335766424,
"grad_norm": 0.5677556991577148,
"learning_rate": 0.00014884809378170642,
"loss": 3.8898,
"step": 175200
},
{
"epoch": 2.5693430656934306,
"grad_norm": 0.8451770544052124,
"learning_rate": 0.00014861451543441925,
"loss": 3.8997,
"step": 176000
},
{
"epoch": 2.581021897810219,
"grad_norm": 0.9004770517349243,
"learning_rate": 0.00014838093708713202,
"loss": 3.8918,
"step": 176800
},
{
"epoch": 2.5927007299270075,
"grad_norm": 1.2636882066726685,
"learning_rate": 0.00014814735873984483,
"loss": 3.8952,
"step": 177600
},
{
"epoch": 2.6043795620437957,
"grad_norm": 1.2926832437515259,
"learning_rate": 0.0001479137803925576,
"loss": 3.8955,
"step": 178400
},
{
"epoch": 2.616058394160584,
"grad_norm": 1.2102513313293457,
"learning_rate": 0.00014768020204527043,
"loss": 3.883,
"step": 179200
},
{
"epoch": 2.627737226277372,
"grad_norm": 0.8510094285011292,
"learning_rate": 0.0001474466236979832,
"loss": 3.8857,
"step": 180000
},
{
"epoch": 2.6394160583941604,
"grad_norm": 1.0621333122253418,
"learning_rate": 0.000147213045350696,
"loss": 3.8913,
"step": 180800
},
{
"epoch": 2.651094890510949,
"grad_norm": 0.6437257528305054,
"learning_rate": 0.00014697946700340878,
"loss": 3.8867,
"step": 181600
},
{
"epoch": 2.6627737226277373,
"grad_norm": 1.1535508632659912,
"learning_rate": 0.00014674588865612158,
"loss": 3.8867,
"step": 182400
},
{
"epoch": 2.6744525547445255,
"grad_norm": 0.8997545838356018,
"learning_rate": 0.00014651231030883438,
"loss": 3.8981,
"step": 183200
},
{
"epoch": 2.686131386861314,
"grad_norm": 1.1245380640029907,
"learning_rate": 0.00014627873196154718,
"loss": 3.889,
"step": 184000
},
{
"epoch": 2.697810218978102,
"grad_norm": 1.0414170026779175,
"learning_rate": 0.00014604515361425996,
"loss": 3.8838,
"step": 184800
},
{
"epoch": 2.7094890510948906,
"grad_norm": 0.8895809054374695,
"learning_rate": 0.00014581157526697276,
"loss": 3.898,
"step": 185600
},
{
"epoch": 2.721167883211679,
"grad_norm": 0.9932270050048828,
"learning_rate": 0.00014557799691968553,
"loss": 3.8895,
"step": 186400
},
{
"epoch": 2.732846715328467,
"grad_norm": 0.9618707895278931,
"learning_rate": 0.00014534441857239836,
"loss": 3.8977,
"step": 187200
},
{
"epoch": 2.7445255474452557,
"grad_norm": 0.6126194000244141,
"learning_rate": 0.00014511084022511114,
"loss": 3.8931,
"step": 188000
},
{
"epoch": 2.756204379562044,
"grad_norm": 0.9863024353981018,
"learning_rate": 0.00014487726187782394,
"loss": 3.883,
"step": 188800
},
{
"epoch": 2.767883211678832,
"grad_norm": 0.9130956530570984,
"learning_rate": 0.00014464368353053671,
"loss": 3.8944,
"step": 189600
},
{
"epoch": 2.7795620437956203,
"grad_norm": 1.2602386474609375,
"learning_rate": 0.00014441010518324952,
"loss": 3.8864,
"step": 190400
},
{
"epoch": 2.7912408759124085,
"grad_norm": 0.5719624161720276,
"learning_rate": 0.00014417652683596232,
"loss": 3.8885,
"step": 191200
},
{
"epoch": 2.802919708029197,
"grad_norm": 0.846307635307312,
"learning_rate": 0.00014394294848867512,
"loss": 3.9102,
"step": 192000
},
{
"epoch": 2.8145985401459854,
"grad_norm": 1.140491247177124,
"learning_rate": 0.0001437093701413879,
"loss": 3.9015,
"step": 192800
},
{
"epoch": 2.8262773722627736,
"grad_norm": 0.8926886320114136,
"learning_rate": 0.0001434757917941007,
"loss": 3.8829,
"step": 193600
},
{
"epoch": 2.8379562043795623,
"grad_norm": 0.9810717105865479,
"learning_rate": 0.00014324221344681347,
"loss": 3.8978,
"step": 194400
},
{
"epoch": 2.8496350364963505,
"grad_norm": 1.0074169635772705,
"learning_rate": 0.0001430086350995263,
"loss": 3.8919,
"step": 195200
},
{
"epoch": 2.8613138686131387,
"grad_norm": 0.9317566156387329,
"learning_rate": 0.00014277505675223907,
"loss": 3.9084,
"step": 196000
},
{
"epoch": 2.872992700729927,
"grad_norm": 1.5248185396194458,
"learning_rate": 0.00014254147840495187,
"loss": 3.8973,
"step": 196800
},
{
"epoch": 2.884671532846715,
"grad_norm": 0.8181623816490173,
"learning_rate": 0.00014230790005766465,
"loss": 3.8989,
"step": 197600
},
{
"epoch": 2.896350364963504,
"grad_norm": 1.0604304075241089,
"learning_rate": 0.00014207432171037745,
"loss": 3.8824,
"step": 198400
},
{
"epoch": 2.908029197080292,
"grad_norm": 1.1073662042617798,
"learning_rate": 0.00014184074336309025,
"loss": 3.883,
"step": 199200
},
{
"epoch": 2.9197080291970803,
"grad_norm": 1.4300545454025269,
"learning_rate": 0.00014160716501580305,
"loss": 3.8935,
"step": 200000
},
{
"epoch": 2.9313868613138685,
"grad_norm": 0.9314271807670593,
"learning_rate": 0.00014137358666851583,
"loss": 3.8885,
"step": 200800
},
{
"epoch": 2.9430656934306567,
"grad_norm": 1.5918676853179932,
"learning_rate": 0.00014114000832122863,
"loss": 3.8834,
"step": 201600
},
{
"epoch": 2.9547445255474454,
"grad_norm": 0.9951960444450378,
"learning_rate": 0.0001409064299739414,
"loss": 3.8937,
"step": 202400
},
{
"epoch": 2.9664233576642336,
"grad_norm": 1.1108194589614868,
"learning_rate": 0.00014067285162665423,
"loss": 3.8979,
"step": 203200
},
{
"epoch": 2.978102189781022,
"grad_norm": 1.1282997131347656,
"learning_rate": 0.000140439273279367,
"loss": 3.8962,
"step": 204000
},
{
"epoch": 2.9897810218978105,
"grad_norm": 1.1506567001342773,
"learning_rate": 0.0001402056949320798,
"loss": 3.8997,
"step": 204800
},
{
"epoch": 3.0014598540145987,
"grad_norm": 1.0932163000106812,
"learning_rate": 0.00013997211658479258,
"loss": 3.8628,
"step": 205600
},
{
"epoch": 3.013138686131387,
"grad_norm": 1.4452706575393677,
"learning_rate": 0.0001397385382375054,
"loss": 3.7579,
"step": 206400
},
{
"epoch": 3.024817518248175,
"grad_norm": 1.5414268970489502,
"learning_rate": 0.00013950495989021818,
"loss": 3.7585,
"step": 207200
},
{
"epoch": 3.0364963503649633,
"grad_norm": 0.8187559843063354,
"learning_rate": 0.00013927138154293099,
"loss": 3.7596,
"step": 208000
},
{
"epoch": 3.048175182481752,
"grad_norm": 1.4612154960632324,
"learning_rate": 0.00013903780319564376,
"loss": 3.7583,
"step": 208800
},
{
"epoch": 3.0598540145985402,
"grad_norm": 0.7291022539138794,
"learning_rate": 0.00013880422484835656,
"loss": 3.753,
"step": 209600
},
{
"epoch": 3.0715328467153284,
"grad_norm": 0.9759907126426697,
"learning_rate": 0.00013857064650106936,
"loss": 3.7531,
"step": 210400
},
{
"epoch": 3.0832116788321167,
"grad_norm": 0.8981759548187256,
"learning_rate": 0.00013833706815378217,
"loss": 3.7598,
"step": 211200
},
{
"epoch": 3.094890510948905,
"grad_norm": 0.9674969911575317,
"learning_rate": 0.00013810348980649494,
"loss": 3.7593,
"step": 212000
},
{
"epoch": 3.1065693430656935,
"grad_norm": 1.410812497138977,
"learning_rate": 0.00013786991145920774,
"loss": 3.7674,
"step": 212800
},
{
"epoch": 3.1182481751824818,
"grad_norm": 1.0926766395568848,
"learning_rate": 0.00013763633311192052,
"loss": 3.7617,
"step": 213600
},
{
"epoch": 3.12992700729927,
"grad_norm": 0.9864111542701721,
"learning_rate": 0.00013740275476463334,
"loss": 3.7752,
"step": 214400
},
{
"epoch": 3.141605839416058,
"grad_norm": 0.8889511823654175,
"learning_rate": 0.00013716917641734612,
"loss": 3.7727,
"step": 215200
},
{
"epoch": 3.153284671532847,
"grad_norm": 0.8842699527740479,
"learning_rate": 0.00013693559807005892,
"loss": 3.7694,
"step": 216000
},
{
"epoch": 3.164963503649635,
"grad_norm": 1.7299695014953613,
"learning_rate": 0.0001367020197227717,
"loss": 3.7738,
"step": 216800
},
{
"epoch": 3.1766423357664233,
"grad_norm": 0.9839622974395752,
"learning_rate": 0.0001364684413754845,
"loss": 3.7947,
"step": 217600
},
{
"epoch": 3.1883211678832115,
"grad_norm": 1.3334280252456665,
"learning_rate": 0.0001362348630281973,
"loss": 3.7778,
"step": 218400
},
{
"epoch": 3.2,
"grad_norm": 1.234206199645996,
"learning_rate": 0.0001360012846809101,
"loss": 3.7756,
"step": 219200
},
{
"epoch": 3.2116788321167884,
"grad_norm": 0.7193965315818787,
"learning_rate": 0.00013576770633362287,
"loss": 3.7879,
"step": 220000
},
{
"epoch": 3.2233576642335766,
"grad_norm": 1.1315131187438965,
"learning_rate": 0.00013553412798633568,
"loss": 3.7775,
"step": 220800
},
{
"epoch": 3.235036496350365,
"grad_norm": 1.6959398984909058,
"learning_rate": 0.00013530054963904845,
"loss": 3.7813,
"step": 221600
},
{
"epoch": 3.246715328467153,
"grad_norm": 2.144179344177246,
"learning_rate": 0.00013506697129176128,
"loss": 3.7948,
"step": 222400
},
{
"epoch": 3.2583941605839417,
"grad_norm": 1.4156116247177124,
"learning_rate": 0.00013483339294447405,
"loss": 3.7894,
"step": 223200
},
{
"epoch": 3.27007299270073,
"grad_norm": 0.8479212522506714,
"learning_rate": 0.00013459981459718685,
"loss": 3.8035,
"step": 224000
},
{
"epoch": 3.281751824817518,
"grad_norm": 0.8472751975059509,
"learning_rate": 0.00013436623624989963,
"loss": 3.8009,
"step": 224800
},
{
"epoch": 3.293430656934307,
"grad_norm": 1.2888227701187134,
"learning_rate": 0.00013413265790261243,
"loss": 3.7939,
"step": 225600
},
{
"epoch": 3.305109489051095,
"grad_norm": 1.1597789525985718,
"learning_rate": 0.00013389907955532523,
"loss": 3.7926,
"step": 226400
},
{
"epoch": 3.3167883211678832,
"grad_norm": 0.7779558300971985,
"learning_rate": 0.00013366550120803803,
"loss": 3.8011,
"step": 227200
},
{
"epoch": 3.3284671532846715,
"grad_norm": 0.9646685719490051,
"learning_rate": 0.0001334319228607508,
"loss": 3.7926,
"step": 228000
},
{
"epoch": 3.3401459854014597,
"grad_norm": 0.9660009741783142,
"learning_rate": 0.0001331983445134636,
"loss": 3.802,
"step": 228800
},
{
"epoch": 3.3518248175182483,
"grad_norm": 1.1353583335876465,
"learning_rate": 0.00013296476616617638,
"loss": 3.8077,
"step": 229600
},
{
"epoch": 3.3635036496350366,
"grad_norm": 0.976076602935791,
"learning_rate": 0.0001327311878188892,
"loss": 3.7869,
"step": 230400
},
{
"epoch": 3.375182481751825,
"grad_norm": 1.091763973236084,
"learning_rate": 0.000132497609471602,
"loss": 3.7933,
"step": 231200
},
{
"epoch": 3.386861313868613,
"grad_norm": 1.6800352334976196,
"learning_rate": 0.0001322640311243148,
"loss": 3.7964,
"step": 232000
},
{
"epoch": 3.398540145985401,
"grad_norm": 1.7834371328353882,
"learning_rate": 0.00013203045277702756,
"loss": 3.8052,
"step": 232800
},
{
"epoch": 3.41021897810219,
"grad_norm": 1.1066137552261353,
"learning_rate": 0.0001317968744297404,
"loss": 3.8186,
"step": 233600
},
{
"epoch": 3.421897810218978,
"grad_norm": 1.4821542501449585,
"learning_rate": 0.00013156329608245317,
"loss": 3.8012,
"step": 234400
},
{
"epoch": 3.4335766423357663,
"grad_norm": 1.6962345838546753,
"learning_rate": 0.00013132971773516597,
"loss": 3.7981,
"step": 235200
},
{
"epoch": 3.445255474452555,
"grad_norm": 1.6591582298278809,
"learning_rate": 0.00013109613938787874,
"loss": 3.8167,
"step": 236000
},
{
"epoch": 3.456934306569343,
"grad_norm": 0.7325506806373596,
"learning_rate": 0.00013086256104059154,
"loss": 3.8046,
"step": 236800
},
{
"epoch": 3.4686131386861314,
"grad_norm": 1.017753005027771,
"learning_rate": 0.00013062898269330434,
"loss": 3.8098,
"step": 237600
},
{
"epoch": 3.4802919708029196,
"grad_norm": 1.0426437854766846,
"learning_rate": 0.00013039540434601715,
"loss": 3.8045,
"step": 238400
},
{
"epoch": 3.491970802919708,
"grad_norm": 0.8641120791435242,
"learning_rate": 0.00013016182599872992,
"loss": 3.8202,
"step": 239200
},
{
"epoch": 3.5036496350364965,
"grad_norm": 0.7680474519729614,
"learning_rate": 0.00012992824765144272,
"loss": 3.8116,
"step": 240000
},
{
"epoch": 3.5153284671532847,
"grad_norm": 0.8205093741416931,
"learning_rate": 0.0001296946693041555,
"loss": 3.8002,
"step": 240800
},
{
"epoch": 3.527007299270073,
"grad_norm": 1.1150528192520142,
"learning_rate": 0.00012946109095686833,
"loss": 3.8154,
"step": 241600
},
{
"epoch": 3.538686131386861,
"grad_norm": 0.9264869689941406,
"learning_rate": 0.0001292275126095811,
"loss": 3.7957,
"step": 242400
},
{
"epoch": 3.5503649635036494,
"grad_norm": 0.9504124522209167,
"learning_rate": 0.0001289939342622939,
"loss": 3.8059,
"step": 243200
},
{
"epoch": 3.562043795620438,
"grad_norm": 0.6638396382331848,
"learning_rate": 0.00012876035591500668,
"loss": 3.8171,
"step": 244000
},
{
"epoch": 3.5737226277372263,
"grad_norm": 0.5771734118461609,
"learning_rate": 0.00012852677756771948,
"loss": 3.8241,
"step": 244800
},
{
"epoch": 3.5854014598540145,
"grad_norm": 0.9084689617156982,
"learning_rate": 0.00012829319922043228,
"loss": 3.8075,
"step": 245600
},
{
"epoch": 3.597080291970803,
"grad_norm": 1.1063374280929565,
"learning_rate": 0.00012805962087314508,
"loss": 3.8197,
"step": 246400
},
{
"epoch": 3.6087591240875914,
"grad_norm": 0.9490681886672974,
"learning_rate": 0.00012782604252585785,
"loss": 3.8158,
"step": 247200
},
{
"epoch": 3.6204379562043796,
"grad_norm": 0.771484375,
"learning_rate": 0.00012759246417857066,
"loss": 3.8136,
"step": 248000
},
{
"epoch": 3.632116788321168,
"grad_norm": 1.1464002132415771,
"learning_rate": 0.00012735888583128343,
"loss": 3.8167,
"step": 248800
},
{
"epoch": 3.643795620437956,
"grad_norm": 1.292195200920105,
"learning_rate": 0.00012712530748399626,
"loss": 3.8133,
"step": 249600
},
{
"epoch": 3.6554744525547447,
"grad_norm": 1.0379976034164429,
"learning_rate": 0.00012689172913670903,
"loss": 3.8102,
"step": 250400
},
{
"epoch": 3.667153284671533,
"grad_norm": 1.7028378248214722,
"learning_rate": 0.00012665815078942183,
"loss": 3.8138,
"step": 251200
},
{
"epoch": 3.678832116788321,
"grad_norm": 1.4890276193618774,
"learning_rate": 0.0001264245724421346,
"loss": 3.8195,
"step": 252000
},
{
"epoch": 3.6905109489051093,
"grad_norm": 1.1416970491409302,
"learning_rate": 0.0001261909940948474,
"loss": 3.8083,
"step": 252800
},
{
"epoch": 3.7021897810218976,
"grad_norm": 1.3536219596862793,
"learning_rate": 0.0001259574157475602,
"loss": 3.8155,
"step": 253600
},
{
"epoch": 3.713868613138686,
"grad_norm": 0.939917266368866,
"learning_rate": 0.00012572383740027301,
"loss": 3.8224,
"step": 254400
},
{
"epoch": 3.7255474452554744,
"grad_norm": 0.570955753326416,
"learning_rate": 0.0001254902590529858,
"loss": 3.8202,
"step": 255200
},
{
"epoch": 3.7372262773722627,
"grad_norm": 1.467022180557251,
"learning_rate": 0.0001252566807056986,
"loss": 3.8217,
"step": 256000
},
{
"epoch": 3.7489051094890513,
"grad_norm": 0.7063941955566406,
"learning_rate": 0.00012502310235841136,
"loss": 3.8166,
"step": 256800
},
{
"epoch": 3.7605839416058395,
"grad_norm": 1.1569101810455322,
"learning_rate": 0.0001247895240111242,
"loss": 3.8151,
"step": 257600
},
{
"epoch": 3.7722627737226277,
"grad_norm": 1.2285373210906982,
"learning_rate": 0.00012455594566383697,
"loss": 3.8214,
"step": 258400
},
{
"epoch": 3.783941605839416,
"grad_norm": 0.9570793509483337,
"learning_rate": 0.00012432236731654977,
"loss": 3.8126,
"step": 259200
},
{
"epoch": 3.795620437956204,
"grad_norm": 0.7642357349395752,
"learning_rate": 0.00012408878896926254,
"loss": 3.8039,
"step": 260000
},
{
"epoch": 3.807299270072993,
"grad_norm": 1.2175133228302002,
"learning_rate": 0.00012385521062197537,
"loss": 3.8103,
"step": 260800
},
{
"epoch": 3.818978102189781,
"grad_norm": 0.6660974025726318,
"learning_rate": 0.00012362163227468815,
"loss": 3.8113,
"step": 261600
},
{
"epoch": 3.8306569343065693,
"grad_norm": 1.5753804445266724,
"learning_rate": 0.00012338805392740095,
"loss": 3.8047,
"step": 262400
},
{
"epoch": 3.8423357664233575,
"grad_norm": 0.9252421259880066,
"learning_rate": 0.00012315447558011372,
"loss": 3.8209,
"step": 263200
},
{
"epoch": 3.8540145985401457,
"grad_norm": 1.3272552490234375,
"learning_rate": 0.00012292089723282652,
"loss": 3.8153,
"step": 264000
}
],
"logging_steps": 800,
"max_steps": 685000,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 4000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.2911936164773396e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}