{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.8540145985401457, "eval_steps": 500, "global_step": 264000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01167883211678832, "grad_norm": 0.7469853758811951, "learning_rate": 0.00019976817349031748, "loss": 4.9807, "step": 800 }, { "epoch": 0.02335766423357664, "grad_norm": 1.120423436164856, "learning_rate": 0.00019953459514303025, "loss": 4.6712, "step": 1600 }, { "epoch": 0.035036496350364967, "grad_norm": 1.064866542816162, "learning_rate": 0.00019930101679574305, "loss": 4.5512, "step": 2400 }, { "epoch": 0.04671532846715328, "grad_norm": 0.8409781455993652, "learning_rate": 0.00019906743844845583, "loss": 4.4947, "step": 3200 }, { "epoch": 0.058394160583941604, "grad_norm": 1.03812575340271, "learning_rate": 0.00019883386010116863, "loss": 4.4487, "step": 4000 }, { "epoch": 0.07007299270072993, "grad_norm": 0.6089211106300354, "learning_rate": 0.00019860028175388143, "loss": 4.4197, "step": 4800 }, { "epoch": 0.08175182481751825, "grad_norm": 1.2314237356185913, "learning_rate": 0.00019836670340659423, "loss": 4.3931, "step": 5600 }, { "epoch": 0.09343065693430656, "grad_norm": 0.6221948862075806, "learning_rate": 0.000198133125059307, "loss": 4.3595, "step": 6400 }, { "epoch": 0.10510948905109489, "grad_norm": 0.7354845404624939, "learning_rate": 0.0001978995467120198, "loss": 4.3489, "step": 7200 }, { "epoch": 0.11678832116788321, "grad_norm": 0.47563284635543823, "learning_rate": 0.00019766596836473258, "loss": 4.3167, "step": 8000 }, { "epoch": 0.12846715328467154, "grad_norm": 0.5970898270606995, "learning_rate": 0.0001974323900174454, "loss": 4.3059, "step": 8800 }, { "epoch": 0.14014598540145987, "grad_norm": 1.0531948804855347, "learning_rate": 0.0001971988116701582, "loss": 4.294, "step": 9600 }, { "epoch": 0.15182481751824817, "grad_norm": 0.6124999523162842, "learning_rate": 0.000196965233322871, "loss": 4.288, "step": 10400 }, { "epoch": 0.1635036496350365, "grad_norm": 0.43388164043426514, "learning_rate": 0.00019673165497558376, "loss": 4.273, "step": 11200 }, { "epoch": 0.17518248175182483, "grad_norm": 1.2883777618408203, "learning_rate": 0.00019649807662829656, "loss": 4.2696, "step": 12000 }, { "epoch": 0.18686131386861313, "grad_norm": 1.0502655506134033, "learning_rate": 0.00019626449828100937, "loss": 4.2584, "step": 12800 }, { "epoch": 0.19854014598540146, "grad_norm": 1.0939903259277344, "learning_rate": 0.00019603091993372217, "loss": 4.2377, "step": 13600 }, { "epoch": 0.21021897810218979, "grad_norm": 0.732036828994751, "learning_rate": 0.00019579734158643494, "loss": 4.2274, "step": 14400 }, { "epoch": 0.22189781021897811, "grad_norm": 0.7493735551834106, "learning_rate": 0.00019556376323914774, "loss": 4.2382, "step": 15200 }, { "epoch": 0.23357664233576642, "grad_norm": 0.7022084593772888, "learning_rate": 0.00019533018489186052, "loss": 4.241, "step": 16000 }, { "epoch": 0.24525547445255474, "grad_norm": 0.6069856882095337, "learning_rate": 0.00019509660654457335, "loss": 4.2072, "step": 16800 }, { "epoch": 0.2569343065693431, "grad_norm": 0.9912208318710327, "learning_rate": 0.00019486302819728612, "loss": 4.2133, "step": 17600 }, { "epoch": 0.2686131386861314, "grad_norm": 0.6228162050247192, "learning_rate": 0.00019462944984999892, "loss": 4.2074, "step": 18400 }, { "epoch": 0.28029197080291973, "grad_norm": 0.6577867269515991, "learning_rate": 0.0001943958715027117, "loss": 4.1989, "step": 19200 }, { "epoch": 0.291970802919708, "grad_norm": 0.5231236815452576, "learning_rate": 0.0001941622931554245, "loss": 4.1921, "step": 20000 }, { "epoch": 0.30364963503649633, "grad_norm": 0.7894455194473267, "learning_rate": 0.0001939287148081373, "loss": 4.1899, "step": 20800 }, { "epoch": 0.31532846715328466, "grad_norm": 0.7088421583175659, "learning_rate": 0.0001936951364608501, "loss": 4.1795, "step": 21600 }, { "epoch": 0.327007299270073, "grad_norm": 0.7622804045677185, "learning_rate": 0.00019346155811356288, "loss": 4.1789, "step": 22400 }, { "epoch": 0.3386861313868613, "grad_norm": 0.41267886757850647, "learning_rate": 0.00019322797976627568, "loss": 4.1695, "step": 23200 }, { "epoch": 0.35036496350364965, "grad_norm": 0.6643211245536804, "learning_rate": 0.00019299440141898848, "loss": 4.1714, "step": 24000 }, { "epoch": 0.362043795620438, "grad_norm": 1.1012191772460938, "learning_rate": 0.00019276082307170128, "loss": 4.1677, "step": 24800 }, { "epoch": 0.37372262773722625, "grad_norm": 0.5746153593063354, "learning_rate": 0.00019252724472441405, "loss": 4.1463, "step": 25600 }, { "epoch": 0.3854014598540146, "grad_norm": 0.6068096160888672, "learning_rate": 0.00019229366637712686, "loss": 4.1643, "step": 26400 }, { "epoch": 0.3970802919708029, "grad_norm": 0.4222017228603363, "learning_rate": 0.00019206008802983963, "loss": 4.1627, "step": 27200 }, { "epoch": 0.40875912408759124, "grad_norm": 1.108417272567749, "learning_rate": 0.00019182650968255246, "loss": 4.1555, "step": 28000 }, { "epoch": 0.42043795620437957, "grad_norm": 0.847973644733429, "learning_rate": 0.00019159293133526523, "loss": 4.1604, "step": 28800 }, { "epoch": 0.4321167883211679, "grad_norm": 0.7368318438529968, "learning_rate": 0.00019135935298797804, "loss": 4.1497, "step": 29600 }, { "epoch": 0.44379562043795623, "grad_norm": 1.041403889656067, "learning_rate": 0.0001911257746406908, "loss": 4.1417, "step": 30400 }, { "epoch": 0.4554744525547445, "grad_norm": 0.6244940757751465, "learning_rate": 0.0001908921962934036, "loss": 4.1642, "step": 31200 }, { "epoch": 0.46715328467153283, "grad_norm": 0.871951162815094, "learning_rate": 0.0001906586179461164, "loss": 4.1358, "step": 32000 }, { "epoch": 0.47883211678832116, "grad_norm": 1.751654028892517, "learning_rate": 0.00019042503959882921, "loss": 4.1393, "step": 32800 }, { "epoch": 0.4905109489051095, "grad_norm": 0.8029406070709229, "learning_rate": 0.000190191461251542, "loss": 4.132, "step": 33600 }, { "epoch": 0.5021897810218978, "grad_norm": 1.084448218345642, "learning_rate": 0.0001899578829042548, "loss": 4.1323, "step": 34400 }, { "epoch": 0.5138686131386861, "grad_norm": 0.8661244511604309, "learning_rate": 0.00018972430455696756, "loss": 4.1388, "step": 35200 }, { "epoch": 0.5255474452554745, "grad_norm": 0.7281563878059387, "learning_rate": 0.0001894907262096804, "loss": 4.1354, "step": 36000 }, { "epoch": 0.5372262773722628, "grad_norm": 0.9080061912536621, "learning_rate": 0.00018925714786239317, "loss": 4.1294, "step": 36800 }, { "epoch": 0.5489051094890511, "grad_norm": 0.7405328750610352, "learning_rate": 0.00018902356951510597, "loss": 4.1279, "step": 37600 }, { "epoch": 0.5605839416058395, "grad_norm": 0.5771759748458862, "learning_rate": 0.00018878999116781874, "loss": 4.1094, "step": 38400 }, { "epoch": 0.5722627737226277, "grad_norm": 0.7671827077865601, "learning_rate": 0.00018855641282053155, "loss": 4.1127, "step": 39200 }, { "epoch": 0.583941605839416, "grad_norm": 0.7648681998252869, "learning_rate": 0.00018832283447324435, "loss": 4.1126, "step": 40000 }, { "epoch": 0.5956204379562043, "grad_norm": 0.8951389789581299, "learning_rate": 0.00018808925612595715, "loss": 4.1117, "step": 40800 }, { "epoch": 0.6072992700729927, "grad_norm": 0.6852630972862244, "learning_rate": 0.00018785567777866992, "loss": 4.1218, "step": 41600 }, { "epoch": 0.618978102189781, "grad_norm": 0.8639559745788574, "learning_rate": 0.00018762209943138272, "loss": 4.1041, "step": 42400 }, { "epoch": 0.6306569343065693, "grad_norm": 0.7715902924537659, "learning_rate": 0.0001873885210840955, "loss": 4.1104, "step": 43200 }, { "epoch": 0.6423357664233577, "grad_norm": 0.9155542254447937, "learning_rate": 0.00018715494273680833, "loss": 4.1059, "step": 44000 }, { "epoch": 0.654014598540146, "grad_norm": 0.780484676361084, "learning_rate": 0.0001869213643895211, "loss": 4.0958, "step": 44800 }, { "epoch": 0.6656934306569343, "grad_norm": 0.6784650087356567, "learning_rate": 0.0001866877860422339, "loss": 4.0841, "step": 45600 }, { "epoch": 0.6773722627737226, "grad_norm": 1.1621425151824951, "learning_rate": 0.00018645420769494668, "loss": 4.1108, "step": 46400 }, { "epoch": 0.689051094890511, "grad_norm": 0.47390952706336975, "learning_rate": 0.00018622062934765948, "loss": 4.0979, "step": 47200 }, { "epoch": 0.7007299270072993, "grad_norm": 0.690737247467041, "learning_rate": 0.00018598705100037228, "loss": 4.0978, "step": 48000 }, { "epoch": 0.7124087591240876, "grad_norm": 1.1309623718261719, "learning_rate": 0.00018575347265308508, "loss": 4.097, "step": 48800 }, { "epoch": 0.724087591240876, "grad_norm": 0.6210489869117737, "learning_rate": 0.00018551989430579786, "loss": 4.0909, "step": 49600 }, { "epoch": 0.7357664233576642, "grad_norm": 0.673042356967926, "learning_rate": 0.00018528631595851066, "loss": 4.0867, "step": 50400 }, { "epoch": 0.7474452554744525, "grad_norm": 0.5582263469696045, "learning_rate": 0.00018505273761122346, "loss": 4.0917, "step": 51200 }, { "epoch": 0.7591240875912408, "grad_norm": 0.6824519634246826, "learning_rate": 0.00018481915926393626, "loss": 4.0833, "step": 52000 }, { "epoch": 0.7708029197080292, "grad_norm": 0.7440599799156189, "learning_rate": 0.00018458558091664904, "loss": 4.0798, "step": 52800 }, { "epoch": 0.7824817518248175, "grad_norm": 0.7103509902954102, "learning_rate": 0.00018435200256936184, "loss": 4.0903, "step": 53600 }, { "epoch": 0.7941605839416058, "grad_norm": 0.9494150280952454, "learning_rate": 0.0001841184242220746, "loss": 4.0924, "step": 54400 }, { "epoch": 0.8058394160583942, "grad_norm": 0.7308784127235413, "learning_rate": 0.0001838848458747874, "loss": 4.0965, "step": 55200 }, { "epoch": 0.8175182481751825, "grad_norm": 0.7546706199645996, "learning_rate": 0.00018365126752750021, "loss": 4.0852, "step": 56000 }, { "epoch": 0.8291970802919708, "grad_norm": 0.6851491928100586, "learning_rate": 0.00018341768918021302, "loss": 4.0805, "step": 56800 }, { "epoch": 0.8408759124087591, "grad_norm": 0.6497614979743958, "learning_rate": 0.0001831841108329258, "loss": 4.0908, "step": 57600 }, { "epoch": 0.8525547445255475, "grad_norm": 0.8901756405830383, "learning_rate": 0.0001829505324856386, "loss": 4.0665, "step": 58400 }, { "epoch": 0.8642335766423358, "grad_norm": 0.7579403519630432, "learning_rate": 0.0001827169541383514, "loss": 4.0739, "step": 59200 }, { "epoch": 0.8759124087591241, "grad_norm": 0.7560231685638428, "learning_rate": 0.0001824833757910642, "loss": 4.0696, "step": 60000 }, { "epoch": 0.8875912408759125, "grad_norm": 0.9320287108421326, "learning_rate": 0.00018224979744377697, "loss": 4.0674, "step": 60800 }, { "epoch": 0.8992700729927007, "grad_norm": 0.6470750570297241, "learning_rate": 0.00018201621909648977, "loss": 4.0664, "step": 61600 }, { "epoch": 0.910948905109489, "grad_norm": 0.7757769227027893, "learning_rate": 0.00018178264074920255, "loss": 4.0676, "step": 62400 }, { "epoch": 0.9226277372262773, "grad_norm": 0.9148152470588684, "learning_rate": 0.00018154906240191535, "loss": 4.0629, "step": 63200 }, { "epoch": 0.9343065693430657, "grad_norm": 0.7893709540367126, "learning_rate": 0.00018131548405462815, "loss": 4.0714, "step": 64000 }, { "epoch": 0.945985401459854, "grad_norm": 0.7790332436561584, "learning_rate": 0.00018108190570734095, "loss": 4.0618, "step": 64800 }, { "epoch": 0.9576642335766423, "grad_norm": 1.025794267654419, "learning_rate": 0.00018084832736005372, "loss": 4.0622, "step": 65600 }, { "epoch": 0.9693430656934306, "grad_norm": 0.9819601774215698, "learning_rate": 0.00018061474901276653, "loss": 4.056, "step": 66400 }, { "epoch": 0.981021897810219, "grad_norm": 0.7966727018356323, "learning_rate": 0.00018038117066547933, "loss": 4.0645, "step": 67200 }, { "epoch": 0.9927007299270073, "grad_norm": 0.5256748199462891, "learning_rate": 0.00018014759231819213, "loss": 4.0625, "step": 68000 }, { "epoch": 1.0043795620437956, "grad_norm": 1.0585341453552246, "learning_rate": 0.0001799140139709049, "loss": 4.0299, "step": 68800 }, { "epoch": 1.0160583941605839, "grad_norm": 0.568466067314148, "learning_rate": 0.0001796804356236177, "loss": 3.982, "step": 69600 }, { "epoch": 1.0277372262773723, "grad_norm": 0.651147186756134, "learning_rate": 0.00017944685727633048, "loss": 3.9653, "step": 70400 }, { "epoch": 1.0394160583941605, "grad_norm": 0.8875618577003479, "learning_rate": 0.00017921327892904328, "loss": 3.9868, "step": 71200 }, { "epoch": 1.051094890510949, "grad_norm": 0.9252369999885559, "learning_rate": 0.00017897970058175608, "loss": 3.9729, "step": 72000 }, { "epoch": 1.0627737226277372, "grad_norm": 0.5309298634529114, "learning_rate": 0.00017874612223446888, "loss": 3.9855, "step": 72800 }, { "epoch": 1.0744525547445256, "grad_norm": 0.7743874192237854, "learning_rate": 0.00017851254388718166, "loss": 3.9786, "step": 73600 }, { "epoch": 1.0861313868613138, "grad_norm": 0.6744789481163025, "learning_rate": 0.00017827896553989446, "loss": 3.9776, "step": 74400 }, { "epoch": 1.0978102189781023, "grad_norm": 0.8282249569892883, "learning_rate": 0.00017804538719260726, "loss": 3.9781, "step": 75200 }, { "epoch": 1.1094890510948905, "grad_norm": 1.0976659059524536, "learning_rate": 0.00017781180884532004, "loss": 3.9795, "step": 76000 }, { "epoch": 1.121167883211679, "grad_norm": 0.560089647769928, "learning_rate": 0.00017757823049803284, "loss": 3.9812, "step": 76800 }, { "epoch": 1.1328467153284671, "grad_norm": 0.9681680798530579, "learning_rate": 0.00017734465215074564, "loss": 3.9711, "step": 77600 }, { "epoch": 1.1445255474452556, "grad_norm": 0.5735695958137512, "learning_rate": 0.00017711107380345844, "loss": 3.9784, "step": 78400 }, { "epoch": 1.1562043795620438, "grad_norm": 0.49498119950294495, "learning_rate": 0.00017687749545617121, "loss": 3.9843, "step": 79200 }, { "epoch": 1.167883211678832, "grad_norm": 1.0702383518218994, "learning_rate": 0.00017664391710888402, "loss": 3.9929, "step": 80000 }, { "epoch": 1.1795620437956205, "grad_norm": 1.3075828552246094, "learning_rate": 0.00017641033876159682, "loss": 3.9816, "step": 80800 }, { "epoch": 1.1912408759124087, "grad_norm": 1.111781120300293, "learning_rate": 0.0001761767604143096, "loss": 3.9792, "step": 81600 }, { "epoch": 1.2029197080291971, "grad_norm": 0.9853603839874268, "learning_rate": 0.0001759431820670224, "loss": 3.9846, "step": 82400 }, { "epoch": 1.2145985401459853, "grad_norm": 0.6544378399848938, "learning_rate": 0.0001757096037197352, "loss": 3.9781, "step": 83200 }, { "epoch": 1.2262773722627738, "grad_norm": 1.1322039365768433, "learning_rate": 0.00017547602537244797, "loss": 3.9898, "step": 84000 }, { "epoch": 1.237956204379562, "grad_norm": 0.7854397892951965, "learning_rate": 0.00017524244702516077, "loss": 3.9751, "step": 84800 }, { "epoch": 1.2496350364963504, "grad_norm": 1.025718092918396, "learning_rate": 0.00017500886867787357, "loss": 3.9901, "step": 85600 }, { "epoch": 1.2613138686131387, "grad_norm": 0.7675819993019104, "learning_rate": 0.00017477529033058637, "loss": 3.984, "step": 86400 }, { "epoch": 1.2729927007299269, "grad_norm": 0.8988509774208069, "learning_rate": 0.00017454171198329915, "loss": 3.9918, "step": 87200 }, { "epoch": 1.2846715328467153, "grad_norm": 1.1059536933898926, "learning_rate": 0.00017430813363601195, "loss": 3.9813, "step": 88000 }, { "epoch": 1.2963503649635038, "grad_norm": 0.5204365253448486, "learning_rate": 0.00017407455528872475, "loss": 3.9744, "step": 88800 }, { "epoch": 1.308029197080292, "grad_norm": 0.9890186786651611, "learning_rate": 0.00017384097694143753, "loss": 3.9834, "step": 89600 }, { "epoch": 1.3197080291970802, "grad_norm": 1.2235816717147827, "learning_rate": 0.00017360739859415033, "loss": 3.993, "step": 90400 }, { "epoch": 1.3313868613138686, "grad_norm": 1.0497245788574219, "learning_rate": 0.00017337382024686313, "loss": 3.9901, "step": 91200 }, { "epoch": 1.343065693430657, "grad_norm": 0.926929235458374, "learning_rate": 0.0001731402418995759, "loss": 3.9742, "step": 92000 }, { "epoch": 1.3547445255474453, "grad_norm": 0.7672074437141418, "learning_rate": 0.0001729066635522887, "loss": 3.9956, "step": 92800 }, { "epoch": 1.3664233576642335, "grad_norm": 0.5817465782165527, "learning_rate": 0.0001726730852050015, "loss": 3.9828, "step": 93600 }, { "epoch": 1.378102189781022, "grad_norm": 0.7459368109703064, "learning_rate": 0.0001724395068577143, "loss": 3.983, "step": 94400 }, { "epoch": 1.3897810218978102, "grad_norm": 0.9725570678710938, "learning_rate": 0.00017220592851042708, "loss": 3.9791, "step": 95200 }, { "epoch": 1.4014598540145986, "grad_norm": 1.300221562385559, "learning_rate": 0.00017197235016313988, "loss": 3.986, "step": 96000 }, { "epoch": 1.4131386861313868, "grad_norm": 0.6552464962005615, "learning_rate": 0.00017173877181585269, "loss": 3.9816, "step": 96800 }, { "epoch": 1.4248175182481753, "grad_norm": 1.0207213163375854, "learning_rate": 0.00017150519346856546, "loss": 3.9733, "step": 97600 }, { "epoch": 1.4364963503649635, "grad_norm": 0.9970253109931946, "learning_rate": 0.00017127161512127826, "loss": 3.9834, "step": 98400 }, { "epoch": 1.448175182481752, "grad_norm": 0.908315896987915, "learning_rate": 0.00017103803677399106, "loss": 3.9877, "step": 99200 }, { "epoch": 1.4598540145985401, "grad_norm": 0.9726221561431885, "learning_rate": 0.00017080445842670384, "loss": 3.977, "step": 100000 }, { "epoch": 1.4715328467153284, "grad_norm": 0.7048055529594421, "learning_rate": 0.00017057088007941664, "loss": 3.986, "step": 100800 }, { "epoch": 1.4832116788321168, "grad_norm": 0.5860503911972046, "learning_rate": 0.00017033730173212944, "loss": 3.9723, "step": 101600 }, { "epoch": 1.4948905109489052, "grad_norm": 1.0115162134170532, "learning_rate": 0.00017010372338484224, "loss": 3.9777, "step": 102400 }, { "epoch": 1.5065693430656935, "grad_norm": 0.9691118597984314, "learning_rate": 0.00016987014503755502, "loss": 3.9749, "step": 103200 }, { "epoch": 1.5182481751824817, "grad_norm": 0.7307090759277344, "learning_rate": 0.00016963656669026782, "loss": 3.98, "step": 104000 }, { "epoch": 1.5299270072992701, "grad_norm": 0.9633815288543701, "learning_rate": 0.00016940298834298062, "loss": 3.9814, "step": 104800 }, { "epoch": 1.5416058394160586, "grad_norm": 0.8716799020767212, "learning_rate": 0.00016916940999569342, "loss": 3.9799, "step": 105600 }, { "epoch": 1.5532846715328468, "grad_norm": 1.063167691230774, "learning_rate": 0.0001689358316484062, "loss": 3.9828, "step": 106400 }, { "epoch": 1.564963503649635, "grad_norm": 0.9315568804740906, "learning_rate": 0.000168702253301119, "loss": 3.9801, "step": 107200 }, { "epoch": 1.5766423357664232, "grad_norm": 1.408599853515625, "learning_rate": 0.00016846867495383177, "loss": 3.9741, "step": 108000 }, { "epoch": 1.5883211678832116, "grad_norm": 0.958906352519989, "learning_rate": 0.00016823509660654457, "loss": 3.9745, "step": 108800 }, { "epoch": 1.6, "grad_norm": 1.124635934829712, "learning_rate": 0.00016800151825925737, "loss": 3.9809, "step": 109600 }, { "epoch": 1.6116788321167883, "grad_norm": 0.8791921138763428, "learning_rate": 0.00016776793991197018, "loss": 3.9703, "step": 110400 }, { "epoch": 1.6233576642335765, "grad_norm": 0.6150586009025574, "learning_rate": 0.00016753436156468295, "loss": 3.9944, "step": 111200 }, { "epoch": 1.635036496350365, "grad_norm": 0.9286842942237854, "learning_rate": 0.00016730078321739575, "loss": 3.972, "step": 112000 }, { "epoch": 1.6467153284671534, "grad_norm": 1.3587080240249634, "learning_rate": 0.00016706720487010855, "loss": 3.9649, "step": 112800 }, { "epoch": 1.6583941605839416, "grad_norm": 1.1084531545639038, "learning_rate": 0.00016683362652282136, "loss": 3.9833, "step": 113600 }, { "epoch": 1.6700729927007298, "grad_norm": 0.8050103783607483, "learning_rate": 0.00016660004817553413, "loss": 3.9743, "step": 114400 }, { "epoch": 1.6817518248175183, "grad_norm": 0.8709130883216858, "learning_rate": 0.00016636646982824693, "loss": 3.9626, "step": 115200 }, { "epoch": 1.6934306569343067, "grad_norm": 0.6005585789680481, "learning_rate": 0.0001661328914809597, "loss": 3.9638, "step": 116000 }, { "epoch": 1.705109489051095, "grad_norm": 1.2463181018829346, "learning_rate": 0.0001658993131336725, "loss": 3.9788, "step": 116800 }, { "epoch": 1.7167883211678832, "grad_norm": 1.1946378946304321, "learning_rate": 0.0001656657347863853, "loss": 3.9697, "step": 117600 }, { "epoch": 1.7284671532846714, "grad_norm": 0.7879184484481812, "learning_rate": 0.0001654321564390981, "loss": 3.9621, "step": 118400 }, { "epoch": 1.7401459854014598, "grad_norm": 1.1674267053604126, "learning_rate": 0.00016519857809181088, "loss": 3.9646, "step": 119200 }, { "epoch": 1.7518248175182483, "grad_norm": 0.980387270450592, "learning_rate": 0.00016496499974452369, "loss": 3.972, "step": 120000 }, { "epoch": 1.7635036496350365, "grad_norm": 0.9502540826797485, "learning_rate": 0.0001647314213972365, "loss": 3.9702, "step": 120800 }, { "epoch": 1.7751824817518247, "grad_norm": 0.6683688759803772, "learning_rate": 0.0001644978430499493, "loss": 3.9704, "step": 121600 }, { "epoch": 1.7868613138686131, "grad_norm": 1.1526950597763062, "learning_rate": 0.00016426426470266206, "loss": 3.9644, "step": 122400 }, { "epoch": 1.7985401459854016, "grad_norm": 0.8221763968467712, "learning_rate": 0.00016403068635537486, "loss": 3.9677, "step": 123200 }, { "epoch": 1.8102189781021898, "grad_norm": 1.111382246017456, "learning_rate": 0.00016379710800808764, "loss": 3.9787, "step": 124000 }, { "epoch": 1.821897810218978, "grad_norm": 1.3993886709213257, "learning_rate": 0.00016356352966080044, "loss": 3.9848, "step": 124800 }, { "epoch": 1.8335766423357664, "grad_norm": 0.9269404411315918, "learning_rate": 0.00016332995131351324, "loss": 3.9477, "step": 125600 }, { "epoch": 1.845255474452555, "grad_norm": 0.9304395318031311, "learning_rate": 0.00016309637296622604, "loss": 3.9742, "step": 126400 }, { "epoch": 1.856934306569343, "grad_norm": 1.1939619779586792, "learning_rate": 0.00016286279461893882, "loss": 3.9732, "step": 127200 }, { "epoch": 1.8686131386861313, "grad_norm": 0.963022768497467, "learning_rate": 0.00016262921627165162, "loss": 3.9625, "step": 128000 }, { "epoch": 1.8802919708029195, "grad_norm": 0.8013544082641602, "learning_rate": 0.00016239563792436442, "loss": 3.9648, "step": 128800 }, { "epoch": 1.891970802919708, "grad_norm": 1.1415009498596191, "learning_rate": 0.00016216205957707722, "loss": 3.9715, "step": 129600 }, { "epoch": 1.9036496350364964, "grad_norm": 0.9819127917289734, "learning_rate": 0.00016192848122979, "loss": 3.9624, "step": 130400 }, { "epoch": 1.9153284671532846, "grad_norm": 0.7112650871276855, "learning_rate": 0.0001616949028825028, "loss": 3.9607, "step": 131200 }, { "epoch": 1.9270072992700729, "grad_norm": 0.7825914025306702, "learning_rate": 0.00016146132453521557, "loss": 3.9492, "step": 132000 }, { "epoch": 1.9386861313868613, "grad_norm": 1.136427402496338, "learning_rate": 0.0001612277461879284, "loss": 3.9607, "step": 132800 }, { "epoch": 1.9503649635036497, "grad_norm": 1.004979133605957, "learning_rate": 0.00016099416784064118, "loss": 3.9734, "step": 133600 }, { "epoch": 1.962043795620438, "grad_norm": 1.0205179452896118, "learning_rate": 0.00016076058949335398, "loss": 3.9629, "step": 134400 }, { "epoch": 1.9737226277372262, "grad_norm": 0.6552355885505676, "learning_rate": 0.00016052701114606675, "loss": 3.9705, "step": 135200 }, { "epoch": 1.9854014598540146, "grad_norm": 0.9943566918373108, "learning_rate": 0.00016029343279877955, "loss": 3.9582, "step": 136000 }, { "epoch": 1.997080291970803, "grad_norm": 1.0246118307113647, "learning_rate": 0.00016005985445149236, "loss": 3.9668, "step": 136800 }, { "epoch": 2.0087591240875913, "grad_norm": 0.783827006816864, "learning_rate": 0.00015982627610420516, "loss": 3.881, "step": 137600 }, { "epoch": 2.0204379562043795, "grad_norm": 0.7887817025184631, "learning_rate": 0.00015959269775691793, "loss": 3.8402, "step": 138400 }, { "epoch": 2.0321167883211677, "grad_norm": 0.9230135679244995, "learning_rate": 0.00015935911940963073, "loss": 3.8429, "step": 139200 }, { "epoch": 2.0437956204379564, "grad_norm": 0.9154564738273621, "learning_rate": 0.0001591255410623435, "loss": 3.8557, "step": 140000 }, { "epoch": 2.0554744525547446, "grad_norm": 1.6573781967163086, "learning_rate": 0.00015889196271505634, "loss": 3.8501, "step": 140800 }, { "epoch": 2.067153284671533, "grad_norm": 0.9853724241256714, "learning_rate": 0.0001586583843677691, "loss": 3.847, "step": 141600 }, { "epoch": 2.078832116788321, "grad_norm": 1.1365453004837036, "learning_rate": 0.0001584248060204819, "loss": 3.8616, "step": 142400 }, { "epoch": 2.0905109489051097, "grad_norm": 1.0716379880905151, "learning_rate": 0.00015819122767319469, "loss": 3.8607, "step": 143200 }, { "epoch": 2.102189781021898, "grad_norm": 0.862193763256073, "learning_rate": 0.0001579576493259075, "loss": 3.8566, "step": 144000 }, { "epoch": 2.113868613138686, "grad_norm": 0.6875022649765015, "learning_rate": 0.0001577240709786203, "loss": 3.8737, "step": 144800 }, { "epoch": 2.1255474452554743, "grad_norm": 0.7993234992027283, "learning_rate": 0.0001574904926313331, "loss": 3.8699, "step": 145600 }, { "epoch": 2.137226277372263, "grad_norm": 1.0347297191619873, "learning_rate": 0.00015725691428404587, "loss": 3.8774, "step": 146400 }, { "epoch": 2.1489051094890512, "grad_norm": 0.8707027435302734, "learning_rate": 0.00015702333593675867, "loss": 3.8687, "step": 147200 }, { "epoch": 2.1605839416058394, "grad_norm": 1.2627824544906616, "learning_rate": 0.00015678975758947144, "loss": 3.8672, "step": 148000 }, { "epoch": 2.1722627737226277, "grad_norm": 1.1346710920333862, "learning_rate": 0.00015655617924218427, "loss": 3.8797, "step": 148800 }, { "epoch": 2.183941605839416, "grad_norm": 1.2239959239959717, "learning_rate": 0.00015632260089489704, "loss": 3.8604, "step": 149600 }, { "epoch": 2.1956204379562045, "grad_norm": 1.1624715328216553, "learning_rate": 0.00015608902254760985, "loss": 3.8678, "step": 150400 }, { "epoch": 2.2072992700729928, "grad_norm": 0.9525280594825745, "learning_rate": 0.00015585544420032262, "loss": 3.8617, "step": 151200 }, { "epoch": 2.218978102189781, "grad_norm": 0.6676674485206604, "learning_rate": 0.00015562186585303542, "loss": 3.8645, "step": 152000 }, { "epoch": 2.230656934306569, "grad_norm": 1.4291656017303467, "learning_rate": 0.00015538828750574822, "loss": 3.8721, "step": 152800 }, { "epoch": 2.242335766423358, "grad_norm": 2.071485996246338, "learning_rate": 0.00015515470915846102, "loss": 3.881, "step": 153600 }, { "epoch": 2.254014598540146, "grad_norm": 1.3130428791046143, "learning_rate": 0.0001549211308111738, "loss": 3.8798, "step": 154400 }, { "epoch": 2.2656934306569343, "grad_norm": 1.0672556161880493, "learning_rate": 0.0001546875524638866, "loss": 3.8767, "step": 155200 }, { "epoch": 2.2773722627737225, "grad_norm": 0.8703996539115906, "learning_rate": 0.0001544539741165994, "loss": 3.8696, "step": 156000 }, { "epoch": 2.289051094890511, "grad_norm": 1.0338706970214844, "learning_rate": 0.0001542203957693122, "loss": 3.8857, "step": 156800 }, { "epoch": 2.3007299270072994, "grad_norm": 0.9246997833251953, "learning_rate": 0.00015398681742202498, "loss": 3.884, "step": 157600 }, { "epoch": 2.3124087591240876, "grad_norm": 0.7899117469787598, "learning_rate": 0.00015375323907473778, "loss": 3.8891, "step": 158400 }, { "epoch": 2.324087591240876, "grad_norm": 0.545261800289154, "learning_rate": 0.00015351966072745055, "loss": 3.8702, "step": 159200 }, { "epoch": 2.335766423357664, "grad_norm": 0.6720581650733948, "learning_rate": 0.00015328608238016338, "loss": 3.8962, "step": 160000 }, { "epoch": 2.3474452554744527, "grad_norm": 0.7954906821250916, "learning_rate": 0.00015305250403287616, "loss": 3.8752, "step": 160800 }, { "epoch": 2.359124087591241, "grad_norm": 0.6123435497283936, "learning_rate": 0.00015281892568558896, "loss": 3.8838, "step": 161600 }, { "epoch": 2.370802919708029, "grad_norm": 1.0467143058776855, "learning_rate": 0.00015258534733830173, "loss": 3.8798, "step": 162400 }, { "epoch": 2.3824817518248174, "grad_norm": 0.9251344799995422, "learning_rate": 0.00015235176899101453, "loss": 3.8801, "step": 163200 }, { "epoch": 2.394160583941606, "grad_norm": 1.2078486680984497, "learning_rate": 0.00015211819064372734, "loss": 3.8815, "step": 164000 }, { "epoch": 2.4058394160583942, "grad_norm": 1.1758290529251099, "learning_rate": 0.00015188461229644014, "loss": 3.8854, "step": 164800 }, { "epoch": 2.4175182481751825, "grad_norm": 1.2341893911361694, "learning_rate": 0.0001516510339491529, "loss": 3.8736, "step": 165600 }, { "epoch": 2.4291970802919707, "grad_norm": 0.85239177942276, "learning_rate": 0.00015141745560186571, "loss": 3.8821, "step": 166400 }, { "epoch": 2.4408759124087593, "grad_norm": 0.7495572566986084, "learning_rate": 0.0001511838772545785, "loss": 3.8852, "step": 167200 }, { "epoch": 2.4525547445255476, "grad_norm": 0.9310500621795654, "learning_rate": 0.00015095029890729132, "loss": 3.8904, "step": 168000 }, { "epoch": 2.4642335766423358, "grad_norm": 1.059832215309143, "learning_rate": 0.0001507167205600041, "loss": 3.8716, "step": 168800 }, { "epoch": 2.475912408759124, "grad_norm": 1.6281853914260864, "learning_rate": 0.0001504831422127169, "loss": 3.8841, "step": 169600 }, { "epoch": 2.487591240875912, "grad_norm": 0.7431134581565857, "learning_rate": 0.00015024956386542967, "loss": 3.8971, "step": 170400 }, { "epoch": 2.499270072992701, "grad_norm": 1.9159060716629028, "learning_rate": 0.00015001598551814247, "loss": 3.8938, "step": 171200 }, { "epoch": 2.510948905109489, "grad_norm": 0.8651421070098877, "learning_rate": 0.00014978240717085527, "loss": 3.8959, "step": 172000 }, { "epoch": 2.5226277372262773, "grad_norm": 0.6994857788085938, "learning_rate": 0.00014954882882356807, "loss": 3.9002, "step": 172800 }, { "epoch": 2.5343065693430655, "grad_norm": 0.9615395069122314, "learning_rate": 0.00014931525047628085, "loss": 3.9101, "step": 173600 }, { "epoch": 2.5459854014598537, "grad_norm": 1.0740983486175537, "learning_rate": 0.00014908167212899365, "loss": 3.8918, "step": 174400 }, { "epoch": 2.5576642335766424, "grad_norm": 0.5677556991577148, "learning_rate": 0.00014884809378170642, "loss": 3.8898, "step": 175200 }, { "epoch": 2.5693430656934306, "grad_norm": 0.8451770544052124, "learning_rate": 0.00014861451543441925, "loss": 3.8997, "step": 176000 }, { "epoch": 2.581021897810219, "grad_norm": 0.9004770517349243, "learning_rate": 0.00014838093708713202, "loss": 3.8918, "step": 176800 }, { "epoch": 2.5927007299270075, "grad_norm": 1.2636882066726685, "learning_rate": 0.00014814735873984483, "loss": 3.8952, "step": 177600 }, { "epoch": 2.6043795620437957, "grad_norm": 1.2926832437515259, "learning_rate": 0.0001479137803925576, "loss": 3.8955, "step": 178400 }, { "epoch": 2.616058394160584, "grad_norm": 1.2102513313293457, "learning_rate": 0.00014768020204527043, "loss": 3.883, "step": 179200 }, { "epoch": 2.627737226277372, "grad_norm": 0.8510094285011292, "learning_rate": 0.0001474466236979832, "loss": 3.8857, "step": 180000 }, { "epoch": 2.6394160583941604, "grad_norm": 1.0621333122253418, "learning_rate": 0.000147213045350696, "loss": 3.8913, "step": 180800 }, { "epoch": 2.651094890510949, "grad_norm": 0.6437257528305054, "learning_rate": 0.00014697946700340878, "loss": 3.8867, "step": 181600 }, { "epoch": 2.6627737226277373, "grad_norm": 1.1535508632659912, "learning_rate": 0.00014674588865612158, "loss": 3.8867, "step": 182400 }, { "epoch": 2.6744525547445255, "grad_norm": 0.8997545838356018, "learning_rate": 0.00014651231030883438, "loss": 3.8981, "step": 183200 }, { "epoch": 2.686131386861314, "grad_norm": 1.1245380640029907, "learning_rate": 0.00014627873196154718, "loss": 3.889, "step": 184000 }, { "epoch": 2.697810218978102, "grad_norm": 1.0414170026779175, "learning_rate": 0.00014604515361425996, "loss": 3.8838, "step": 184800 }, { "epoch": 2.7094890510948906, "grad_norm": 0.8895809054374695, "learning_rate": 0.00014581157526697276, "loss": 3.898, "step": 185600 }, { "epoch": 2.721167883211679, "grad_norm": 0.9932270050048828, "learning_rate": 0.00014557799691968553, "loss": 3.8895, "step": 186400 }, { "epoch": 2.732846715328467, "grad_norm": 0.9618707895278931, "learning_rate": 0.00014534441857239836, "loss": 3.8977, "step": 187200 }, { "epoch": 2.7445255474452557, "grad_norm": 0.6126194000244141, "learning_rate": 0.00014511084022511114, "loss": 3.8931, "step": 188000 }, { "epoch": 2.756204379562044, "grad_norm": 0.9863024353981018, "learning_rate": 0.00014487726187782394, "loss": 3.883, "step": 188800 }, { "epoch": 2.767883211678832, "grad_norm": 0.9130956530570984, "learning_rate": 0.00014464368353053671, "loss": 3.8944, "step": 189600 }, { "epoch": 2.7795620437956203, "grad_norm": 1.2602386474609375, "learning_rate": 0.00014441010518324952, "loss": 3.8864, "step": 190400 }, { "epoch": 2.7912408759124085, "grad_norm": 0.5719624161720276, "learning_rate": 0.00014417652683596232, "loss": 3.8885, "step": 191200 }, { "epoch": 2.802919708029197, "grad_norm": 0.846307635307312, "learning_rate": 0.00014394294848867512, "loss": 3.9102, "step": 192000 }, { "epoch": 2.8145985401459854, "grad_norm": 1.140491247177124, "learning_rate": 0.0001437093701413879, "loss": 3.9015, "step": 192800 }, { "epoch": 2.8262773722627736, "grad_norm": 0.8926886320114136, "learning_rate": 0.0001434757917941007, "loss": 3.8829, "step": 193600 }, { "epoch": 2.8379562043795623, "grad_norm": 0.9810717105865479, "learning_rate": 0.00014324221344681347, "loss": 3.8978, "step": 194400 }, { "epoch": 2.8496350364963505, "grad_norm": 1.0074169635772705, "learning_rate": 0.0001430086350995263, "loss": 3.8919, "step": 195200 }, { "epoch": 2.8613138686131387, "grad_norm": 0.9317566156387329, "learning_rate": 0.00014277505675223907, "loss": 3.9084, "step": 196000 }, { "epoch": 2.872992700729927, "grad_norm": 1.5248185396194458, "learning_rate": 0.00014254147840495187, "loss": 3.8973, "step": 196800 }, { "epoch": 2.884671532846715, "grad_norm": 0.8181623816490173, "learning_rate": 0.00014230790005766465, "loss": 3.8989, "step": 197600 }, { "epoch": 2.896350364963504, "grad_norm": 1.0604304075241089, "learning_rate": 0.00014207432171037745, "loss": 3.8824, "step": 198400 }, { "epoch": 2.908029197080292, "grad_norm": 1.1073662042617798, "learning_rate": 0.00014184074336309025, "loss": 3.883, "step": 199200 }, { "epoch": 2.9197080291970803, "grad_norm": 1.4300545454025269, "learning_rate": 0.00014160716501580305, "loss": 3.8935, "step": 200000 }, { "epoch": 2.9313868613138685, "grad_norm": 0.9314271807670593, "learning_rate": 0.00014137358666851583, "loss": 3.8885, "step": 200800 }, { "epoch": 2.9430656934306567, "grad_norm": 1.5918676853179932, "learning_rate": 0.00014114000832122863, "loss": 3.8834, "step": 201600 }, { "epoch": 2.9547445255474454, "grad_norm": 0.9951960444450378, "learning_rate": 0.0001409064299739414, "loss": 3.8937, "step": 202400 }, { "epoch": 2.9664233576642336, "grad_norm": 1.1108194589614868, "learning_rate": 0.00014067285162665423, "loss": 3.8979, "step": 203200 }, { "epoch": 2.978102189781022, "grad_norm": 1.1282997131347656, "learning_rate": 0.000140439273279367, "loss": 3.8962, "step": 204000 }, { "epoch": 2.9897810218978105, "grad_norm": 1.1506567001342773, "learning_rate": 0.0001402056949320798, "loss": 3.8997, "step": 204800 }, { "epoch": 3.0014598540145987, "grad_norm": 1.0932163000106812, "learning_rate": 0.00013997211658479258, "loss": 3.8628, "step": 205600 }, { "epoch": 3.013138686131387, "grad_norm": 1.4452706575393677, "learning_rate": 0.0001397385382375054, "loss": 3.7579, "step": 206400 }, { "epoch": 3.024817518248175, "grad_norm": 1.5414268970489502, "learning_rate": 0.00013950495989021818, "loss": 3.7585, "step": 207200 }, { "epoch": 3.0364963503649633, "grad_norm": 0.8187559843063354, "learning_rate": 0.00013927138154293099, "loss": 3.7596, "step": 208000 }, { "epoch": 3.048175182481752, "grad_norm": 1.4612154960632324, "learning_rate": 0.00013903780319564376, "loss": 3.7583, "step": 208800 }, { "epoch": 3.0598540145985402, "grad_norm": 0.7291022539138794, "learning_rate": 0.00013880422484835656, "loss": 3.753, "step": 209600 }, { "epoch": 3.0715328467153284, "grad_norm": 0.9759907126426697, "learning_rate": 0.00013857064650106936, "loss": 3.7531, "step": 210400 }, { "epoch": 3.0832116788321167, "grad_norm": 0.8981759548187256, "learning_rate": 0.00013833706815378217, "loss": 3.7598, "step": 211200 }, { "epoch": 3.094890510948905, "grad_norm": 0.9674969911575317, "learning_rate": 0.00013810348980649494, "loss": 3.7593, "step": 212000 }, { "epoch": 3.1065693430656935, "grad_norm": 1.410812497138977, "learning_rate": 0.00013786991145920774, "loss": 3.7674, "step": 212800 }, { "epoch": 3.1182481751824818, "grad_norm": 1.0926766395568848, "learning_rate": 0.00013763633311192052, "loss": 3.7617, "step": 213600 }, { "epoch": 3.12992700729927, "grad_norm": 0.9864111542701721, "learning_rate": 0.00013740275476463334, "loss": 3.7752, "step": 214400 }, { "epoch": 3.141605839416058, "grad_norm": 0.8889511823654175, "learning_rate": 0.00013716917641734612, "loss": 3.7727, "step": 215200 }, { "epoch": 3.153284671532847, "grad_norm": 0.8842699527740479, "learning_rate": 0.00013693559807005892, "loss": 3.7694, "step": 216000 }, { "epoch": 3.164963503649635, "grad_norm": 1.7299695014953613, "learning_rate": 0.0001367020197227717, "loss": 3.7738, "step": 216800 }, { "epoch": 3.1766423357664233, "grad_norm": 0.9839622974395752, "learning_rate": 0.0001364684413754845, "loss": 3.7947, "step": 217600 }, { "epoch": 3.1883211678832115, "grad_norm": 1.3334280252456665, "learning_rate": 0.0001362348630281973, "loss": 3.7778, "step": 218400 }, { "epoch": 3.2, "grad_norm": 1.234206199645996, "learning_rate": 0.0001360012846809101, "loss": 3.7756, "step": 219200 }, { "epoch": 3.2116788321167884, "grad_norm": 0.7193965315818787, "learning_rate": 0.00013576770633362287, "loss": 3.7879, "step": 220000 }, { "epoch": 3.2233576642335766, "grad_norm": 1.1315131187438965, "learning_rate": 0.00013553412798633568, "loss": 3.7775, "step": 220800 }, { "epoch": 3.235036496350365, "grad_norm": 1.6959398984909058, "learning_rate": 0.00013530054963904845, "loss": 3.7813, "step": 221600 }, { "epoch": 3.246715328467153, "grad_norm": 2.144179344177246, "learning_rate": 0.00013506697129176128, "loss": 3.7948, "step": 222400 }, { "epoch": 3.2583941605839417, "grad_norm": 1.4156116247177124, "learning_rate": 0.00013483339294447405, "loss": 3.7894, "step": 223200 }, { "epoch": 3.27007299270073, "grad_norm": 0.8479212522506714, "learning_rate": 0.00013459981459718685, "loss": 3.8035, "step": 224000 }, { "epoch": 3.281751824817518, "grad_norm": 0.8472751975059509, "learning_rate": 0.00013436623624989963, "loss": 3.8009, "step": 224800 }, { "epoch": 3.293430656934307, "grad_norm": 1.2888227701187134, "learning_rate": 0.00013413265790261243, "loss": 3.7939, "step": 225600 }, { "epoch": 3.305109489051095, "grad_norm": 1.1597789525985718, "learning_rate": 0.00013389907955532523, "loss": 3.7926, "step": 226400 }, { "epoch": 3.3167883211678832, "grad_norm": 0.7779558300971985, "learning_rate": 0.00013366550120803803, "loss": 3.8011, "step": 227200 }, { "epoch": 3.3284671532846715, "grad_norm": 0.9646685719490051, "learning_rate": 0.0001334319228607508, "loss": 3.7926, "step": 228000 }, { "epoch": 3.3401459854014597, "grad_norm": 0.9660009741783142, "learning_rate": 0.0001331983445134636, "loss": 3.802, "step": 228800 }, { "epoch": 3.3518248175182483, "grad_norm": 1.1353583335876465, "learning_rate": 0.00013296476616617638, "loss": 3.8077, "step": 229600 }, { "epoch": 3.3635036496350366, "grad_norm": 0.976076602935791, "learning_rate": 0.0001327311878188892, "loss": 3.7869, "step": 230400 }, { "epoch": 3.375182481751825, "grad_norm": 1.091763973236084, "learning_rate": 0.000132497609471602, "loss": 3.7933, "step": 231200 }, { "epoch": 3.386861313868613, "grad_norm": 1.6800352334976196, "learning_rate": 0.0001322640311243148, "loss": 3.7964, "step": 232000 }, { "epoch": 3.398540145985401, "grad_norm": 1.7834371328353882, "learning_rate": 0.00013203045277702756, "loss": 3.8052, "step": 232800 }, { "epoch": 3.41021897810219, "grad_norm": 1.1066137552261353, "learning_rate": 0.0001317968744297404, "loss": 3.8186, "step": 233600 }, { "epoch": 3.421897810218978, "grad_norm": 1.4821542501449585, "learning_rate": 0.00013156329608245317, "loss": 3.8012, "step": 234400 }, { "epoch": 3.4335766423357663, "grad_norm": 1.6962345838546753, "learning_rate": 0.00013132971773516597, "loss": 3.7981, "step": 235200 }, { "epoch": 3.445255474452555, "grad_norm": 1.6591582298278809, "learning_rate": 0.00013109613938787874, "loss": 3.8167, "step": 236000 }, { "epoch": 3.456934306569343, "grad_norm": 0.7325506806373596, "learning_rate": 0.00013086256104059154, "loss": 3.8046, "step": 236800 }, { "epoch": 3.4686131386861314, "grad_norm": 1.017753005027771, "learning_rate": 0.00013062898269330434, "loss": 3.8098, "step": 237600 }, { "epoch": 3.4802919708029196, "grad_norm": 1.0426437854766846, "learning_rate": 0.00013039540434601715, "loss": 3.8045, "step": 238400 }, { "epoch": 3.491970802919708, "grad_norm": 0.8641120791435242, "learning_rate": 0.00013016182599872992, "loss": 3.8202, "step": 239200 }, { "epoch": 3.5036496350364965, "grad_norm": 0.7680474519729614, "learning_rate": 0.00012992824765144272, "loss": 3.8116, "step": 240000 }, { "epoch": 3.5153284671532847, "grad_norm": 0.8205093741416931, "learning_rate": 0.0001296946693041555, "loss": 3.8002, "step": 240800 }, { "epoch": 3.527007299270073, "grad_norm": 1.1150528192520142, "learning_rate": 0.00012946109095686833, "loss": 3.8154, "step": 241600 }, { "epoch": 3.538686131386861, "grad_norm": 0.9264869689941406, "learning_rate": 0.0001292275126095811, "loss": 3.7957, "step": 242400 }, { "epoch": 3.5503649635036494, "grad_norm": 0.9504124522209167, "learning_rate": 0.0001289939342622939, "loss": 3.8059, "step": 243200 }, { "epoch": 3.562043795620438, "grad_norm": 0.6638396382331848, "learning_rate": 0.00012876035591500668, "loss": 3.8171, "step": 244000 }, { "epoch": 3.5737226277372263, "grad_norm": 0.5771734118461609, "learning_rate": 0.00012852677756771948, "loss": 3.8241, "step": 244800 }, { "epoch": 3.5854014598540145, "grad_norm": 0.9084689617156982, "learning_rate": 0.00012829319922043228, "loss": 3.8075, "step": 245600 }, { "epoch": 3.597080291970803, "grad_norm": 1.1063374280929565, "learning_rate": 0.00012805962087314508, "loss": 3.8197, "step": 246400 }, { "epoch": 3.6087591240875914, "grad_norm": 0.9490681886672974, "learning_rate": 0.00012782604252585785, "loss": 3.8158, "step": 247200 }, { "epoch": 3.6204379562043796, "grad_norm": 0.771484375, "learning_rate": 0.00012759246417857066, "loss": 3.8136, "step": 248000 }, { "epoch": 3.632116788321168, "grad_norm": 1.1464002132415771, "learning_rate": 0.00012735888583128343, "loss": 3.8167, "step": 248800 }, { "epoch": 3.643795620437956, "grad_norm": 1.292195200920105, "learning_rate": 0.00012712530748399626, "loss": 3.8133, "step": 249600 }, { "epoch": 3.6554744525547447, "grad_norm": 1.0379976034164429, "learning_rate": 0.00012689172913670903, "loss": 3.8102, "step": 250400 }, { "epoch": 3.667153284671533, "grad_norm": 1.7028378248214722, "learning_rate": 0.00012665815078942183, "loss": 3.8138, "step": 251200 }, { "epoch": 3.678832116788321, "grad_norm": 1.4890276193618774, "learning_rate": 0.0001264245724421346, "loss": 3.8195, "step": 252000 }, { "epoch": 3.6905109489051093, "grad_norm": 1.1416970491409302, "learning_rate": 0.0001261909940948474, "loss": 3.8083, "step": 252800 }, { "epoch": 3.7021897810218976, "grad_norm": 1.3536219596862793, "learning_rate": 0.0001259574157475602, "loss": 3.8155, "step": 253600 }, { "epoch": 3.713868613138686, "grad_norm": 0.939917266368866, "learning_rate": 0.00012572383740027301, "loss": 3.8224, "step": 254400 }, { "epoch": 3.7255474452554744, "grad_norm": 0.570955753326416, "learning_rate": 0.0001254902590529858, "loss": 3.8202, "step": 255200 }, { "epoch": 3.7372262773722627, "grad_norm": 1.467022180557251, "learning_rate": 0.0001252566807056986, "loss": 3.8217, "step": 256000 }, { "epoch": 3.7489051094890513, "grad_norm": 0.7063941955566406, "learning_rate": 0.00012502310235841136, "loss": 3.8166, "step": 256800 }, { "epoch": 3.7605839416058395, "grad_norm": 1.1569101810455322, "learning_rate": 0.0001247895240111242, "loss": 3.8151, "step": 257600 }, { "epoch": 3.7722627737226277, "grad_norm": 1.2285373210906982, "learning_rate": 0.00012455594566383697, "loss": 3.8214, "step": 258400 }, { "epoch": 3.783941605839416, "grad_norm": 0.9570793509483337, "learning_rate": 0.00012432236731654977, "loss": 3.8126, "step": 259200 }, { "epoch": 3.795620437956204, "grad_norm": 0.7642357349395752, "learning_rate": 0.00012408878896926254, "loss": 3.8039, "step": 260000 }, { "epoch": 3.807299270072993, "grad_norm": 1.2175133228302002, "learning_rate": 0.00012385521062197537, "loss": 3.8103, "step": 260800 }, { "epoch": 3.818978102189781, "grad_norm": 0.6660974025726318, "learning_rate": 0.00012362163227468815, "loss": 3.8113, "step": 261600 }, { "epoch": 3.8306569343065693, "grad_norm": 1.5753804445266724, "learning_rate": 0.00012338805392740095, "loss": 3.8047, "step": 262400 }, { "epoch": 3.8423357664233575, "grad_norm": 0.9252421259880066, "learning_rate": 0.00012315447558011372, "loss": 3.8209, "step": 263200 }, { "epoch": 3.8540145985401457, "grad_norm": 1.3272552490234375, "learning_rate": 0.00012292089723282652, "loss": 3.8153, "step": 264000 } ], "logging_steps": 800, "max_steps": 685000, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 4000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 4.2911936164773396e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }