| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 3.8540145985401457, |
| "eval_steps": 500, |
| "global_step": 264000, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.01167883211678832, |
| "grad_norm": 0.7469853758811951, |
| "learning_rate": 0.00019976817349031748, |
| "loss": 4.9807, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.02335766423357664, |
| "grad_norm": 1.120423436164856, |
| "learning_rate": 0.00019953459514303025, |
| "loss": 4.6712, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.035036496350364967, |
| "grad_norm": 1.064866542816162, |
| "learning_rate": 0.00019930101679574305, |
| "loss": 4.5512, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.04671532846715328, |
| "grad_norm": 0.8409781455993652, |
| "learning_rate": 0.00019906743844845583, |
| "loss": 4.4947, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.058394160583941604, |
| "grad_norm": 1.03812575340271, |
| "learning_rate": 0.00019883386010116863, |
| "loss": 4.4487, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.07007299270072993, |
| "grad_norm": 0.6089211106300354, |
| "learning_rate": 0.00019860028175388143, |
| "loss": 4.4197, |
| "step": 4800 |
| }, |
| { |
| "epoch": 0.08175182481751825, |
| "grad_norm": 1.2314237356185913, |
| "learning_rate": 0.00019836670340659423, |
| "loss": 4.3931, |
| "step": 5600 |
| }, |
| { |
| "epoch": 0.09343065693430656, |
| "grad_norm": 0.6221948862075806, |
| "learning_rate": 0.000198133125059307, |
| "loss": 4.3595, |
| "step": 6400 |
| }, |
| { |
| "epoch": 0.10510948905109489, |
| "grad_norm": 0.7354845404624939, |
| "learning_rate": 0.0001978995467120198, |
| "loss": 4.3489, |
| "step": 7200 |
| }, |
| { |
| "epoch": 0.11678832116788321, |
| "grad_norm": 0.47563284635543823, |
| "learning_rate": 0.00019766596836473258, |
| "loss": 4.3167, |
| "step": 8000 |
| }, |
| { |
| "epoch": 0.12846715328467154, |
| "grad_norm": 0.5970898270606995, |
| "learning_rate": 0.0001974323900174454, |
| "loss": 4.3059, |
| "step": 8800 |
| }, |
| { |
| "epoch": 0.14014598540145987, |
| "grad_norm": 1.0531948804855347, |
| "learning_rate": 0.0001971988116701582, |
| "loss": 4.294, |
| "step": 9600 |
| }, |
| { |
| "epoch": 0.15182481751824817, |
| "grad_norm": 0.6124999523162842, |
| "learning_rate": 0.000196965233322871, |
| "loss": 4.288, |
| "step": 10400 |
| }, |
| { |
| "epoch": 0.1635036496350365, |
| "grad_norm": 0.43388164043426514, |
| "learning_rate": 0.00019673165497558376, |
| "loss": 4.273, |
| "step": 11200 |
| }, |
| { |
| "epoch": 0.17518248175182483, |
| "grad_norm": 1.2883777618408203, |
| "learning_rate": 0.00019649807662829656, |
| "loss": 4.2696, |
| "step": 12000 |
| }, |
| { |
| "epoch": 0.18686131386861313, |
| "grad_norm": 1.0502655506134033, |
| "learning_rate": 0.00019626449828100937, |
| "loss": 4.2584, |
| "step": 12800 |
| }, |
| { |
| "epoch": 0.19854014598540146, |
| "grad_norm": 1.0939903259277344, |
| "learning_rate": 0.00019603091993372217, |
| "loss": 4.2377, |
| "step": 13600 |
| }, |
| { |
| "epoch": 0.21021897810218979, |
| "grad_norm": 0.732036828994751, |
| "learning_rate": 0.00019579734158643494, |
| "loss": 4.2274, |
| "step": 14400 |
| }, |
| { |
| "epoch": 0.22189781021897811, |
| "grad_norm": 0.7493735551834106, |
| "learning_rate": 0.00019556376323914774, |
| "loss": 4.2382, |
| "step": 15200 |
| }, |
| { |
| "epoch": 0.23357664233576642, |
| "grad_norm": 0.7022084593772888, |
| "learning_rate": 0.00019533018489186052, |
| "loss": 4.241, |
| "step": 16000 |
| }, |
| { |
| "epoch": 0.24525547445255474, |
| "grad_norm": 0.6069856882095337, |
| "learning_rate": 0.00019509660654457335, |
| "loss": 4.2072, |
| "step": 16800 |
| }, |
| { |
| "epoch": 0.2569343065693431, |
| "grad_norm": 0.9912208318710327, |
| "learning_rate": 0.00019486302819728612, |
| "loss": 4.2133, |
| "step": 17600 |
| }, |
| { |
| "epoch": 0.2686131386861314, |
| "grad_norm": 0.6228162050247192, |
| "learning_rate": 0.00019462944984999892, |
| "loss": 4.2074, |
| "step": 18400 |
| }, |
| { |
| "epoch": 0.28029197080291973, |
| "grad_norm": 0.6577867269515991, |
| "learning_rate": 0.0001943958715027117, |
| "loss": 4.1989, |
| "step": 19200 |
| }, |
| { |
| "epoch": 0.291970802919708, |
| "grad_norm": 0.5231236815452576, |
| "learning_rate": 0.0001941622931554245, |
| "loss": 4.1921, |
| "step": 20000 |
| }, |
| { |
| "epoch": 0.30364963503649633, |
| "grad_norm": 0.7894455194473267, |
| "learning_rate": 0.0001939287148081373, |
| "loss": 4.1899, |
| "step": 20800 |
| }, |
| { |
| "epoch": 0.31532846715328466, |
| "grad_norm": 0.7088421583175659, |
| "learning_rate": 0.0001936951364608501, |
| "loss": 4.1795, |
| "step": 21600 |
| }, |
| { |
| "epoch": 0.327007299270073, |
| "grad_norm": 0.7622804045677185, |
| "learning_rate": 0.00019346155811356288, |
| "loss": 4.1789, |
| "step": 22400 |
| }, |
| { |
| "epoch": 0.3386861313868613, |
| "grad_norm": 0.41267886757850647, |
| "learning_rate": 0.00019322797976627568, |
| "loss": 4.1695, |
| "step": 23200 |
| }, |
| { |
| "epoch": 0.35036496350364965, |
| "grad_norm": 0.6643211245536804, |
| "learning_rate": 0.00019299440141898848, |
| "loss": 4.1714, |
| "step": 24000 |
| }, |
| { |
| "epoch": 0.362043795620438, |
| "grad_norm": 1.1012191772460938, |
| "learning_rate": 0.00019276082307170128, |
| "loss": 4.1677, |
| "step": 24800 |
| }, |
| { |
| "epoch": 0.37372262773722625, |
| "grad_norm": 0.5746153593063354, |
| "learning_rate": 0.00019252724472441405, |
| "loss": 4.1463, |
| "step": 25600 |
| }, |
| { |
| "epoch": 0.3854014598540146, |
| "grad_norm": 0.6068096160888672, |
| "learning_rate": 0.00019229366637712686, |
| "loss": 4.1643, |
| "step": 26400 |
| }, |
| { |
| "epoch": 0.3970802919708029, |
| "grad_norm": 0.4222017228603363, |
| "learning_rate": 0.00019206008802983963, |
| "loss": 4.1627, |
| "step": 27200 |
| }, |
| { |
| "epoch": 0.40875912408759124, |
| "grad_norm": 1.108417272567749, |
| "learning_rate": 0.00019182650968255246, |
| "loss": 4.1555, |
| "step": 28000 |
| }, |
| { |
| "epoch": 0.42043795620437957, |
| "grad_norm": 0.847973644733429, |
| "learning_rate": 0.00019159293133526523, |
| "loss": 4.1604, |
| "step": 28800 |
| }, |
| { |
| "epoch": 0.4321167883211679, |
| "grad_norm": 0.7368318438529968, |
| "learning_rate": 0.00019135935298797804, |
| "loss": 4.1497, |
| "step": 29600 |
| }, |
| { |
| "epoch": 0.44379562043795623, |
| "grad_norm": 1.041403889656067, |
| "learning_rate": 0.0001911257746406908, |
| "loss": 4.1417, |
| "step": 30400 |
| }, |
| { |
| "epoch": 0.4554744525547445, |
| "grad_norm": 0.6244940757751465, |
| "learning_rate": 0.0001908921962934036, |
| "loss": 4.1642, |
| "step": 31200 |
| }, |
| { |
| "epoch": 0.46715328467153283, |
| "grad_norm": 0.871951162815094, |
| "learning_rate": 0.0001906586179461164, |
| "loss": 4.1358, |
| "step": 32000 |
| }, |
| { |
| "epoch": 0.47883211678832116, |
| "grad_norm": 1.751654028892517, |
| "learning_rate": 0.00019042503959882921, |
| "loss": 4.1393, |
| "step": 32800 |
| }, |
| { |
| "epoch": 0.4905109489051095, |
| "grad_norm": 0.8029406070709229, |
| "learning_rate": 0.000190191461251542, |
| "loss": 4.132, |
| "step": 33600 |
| }, |
| { |
| "epoch": 0.5021897810218978, |
| "grad_norm": 1.084448218345642, |
| "learning_rate": 0.0001899578829042548, |
| "loss": 4.1323, |
| "step": 34400 |
| }, |
| { |
| "epoch": 0.5138686131386861, |
| "grad_norm": 0.8661244511604309, |
| "learning_rate": 0.00018972430455696756, |
| "loss": 4.1388, |
| "step": 35200 |
| }, |
| { |
| "epoch": 0.5255474452554745, |
| "grad_norm": 0.7281563878059387, |
| "learning_rate": 0.0001894907262096804, |
| "loss": 4.1354, |
| "step": 36000 |
| }, |
| { |
| "epoch": 0.5372262773722628, |
| "grad_norm": 0.9080061912536621, |
| "learning_rate": 0.00018925714786239317, |
| "loss": 4.1294, |
| "step": 36800 |
| }, |
| { |
| "epoch": 0.5489051094890511, |
| "grad_norm": 0.7405328750610352, |
| "learning_rate": 0.00018902356951510597, |
| "loss": 4.1279, |
| "step": 37600 |
| }, |
| { |
| "epoch": 0.5605839416058395, |
| "grad_norm": 0.5771759748458862, |
| "learning_rate": 0.00018878999116781874, |
| "loss": 4.1094, |
| "step": 38400 |
| }, |
| { |
| "epoch": 0.5722627737226277, |
| "grad_norm": 0.7671827077865601, |
| "learning_rate": 0.00018855641282053155, |
| "loss": 4.1127, |
| "step": 39200 |
| }, |
| { |
| "epoch": 0.583941605839416, |
| "grad_norm": 0.7648681998252869, |
| "learning_rate": 0.00018832283447324435, |
| "loss": 4.1126, |
| "step": 40000 |
| }, |
| { |
| "epoch": 0.5956204379562043, |
| "grad_norm": 0.8951389789581299, |
| "learning_rate": 0.00018808925612595715, |
| "loss": 4.1117, |
| "step": 40800 |
| }, |
| { |
| "epoch": 0.6072992700729927, |
| "grad_norm": 0.6852630972862244, |
| "learning_rate": 0.00018785567777866992, |
| "loss": 4.1218, |
| "step": 41600 |
| }, |
| { |
| "epoch": 0.618978102189781, |
| "grad_norm": 0.8639559745788574, |
| "learning_rate": 0.00018762209943138272, |
| "loss": 4.1041, |
| "step": 42400 |
| }, |
| { |
| "epoch": 0.6306569343065693, |
| "grad_norm": 0.7715902924537659, |
| "learning_rate": 0.0001873885210840955, |
| "loss": 4.1104, |
| "step": 43200 |
| }, |
| { |
| "epoch": 0.6423357664233577, |
| "grad_norm": 0.9155542254447937, |
| "learning_rate": 0.00018715494273680833, |
| "loss": 4.1059, |
| "step": 44000 |
| }, |
| { |
| "epoch": 0.654014598540146, |
| "grad_norm": 0.780484676361084, |
| "learning_rate": 0.0001869213643895211, |
| "loss": 4.0958, |
| "step": 44800 |
| }, |
| { |
| "epoch": 0.6656934306569343, |
| "grad_norm": 0.6784650087356567, |
| "learning_rate": 0.0001866877860422339, |
| "loss": 4.0841, |
| "step": 45600 |
| }, |
| { |
| "epoch": 0.6773722627737226, |
| "grad_norm": 1.1621425151824951, |
| "learning_rate": 0.00018645420769494668, |
| "loss": 4.1108, |
| "step": 46400 |
| }, |
| { |
| "epoch": 0.689051094890511, |
| "grad_norm": 0.47390952706336975, |
| "learning_rate": 0.00018622062934765948, |
| "loss": 4.0979, |
| "step": 47200 |
| }, |
| { |
| "epoch": 0.7007299270072993, |
| "grad_norm": 0.690737247467041, |
| "learning_rate": 0.00018598705100037228, |
| "loss": 4.0978, |
| "step": 48000 |
| }, |
| { |
| "epoch": 0.7124087591240876, |
| "grad_norm": 1.1309623718261719, |
| "learning_rate": 0.00018575347265308508, |
| "loss": 4.097, |
| "step": 48800 |
| }, |
| { |
| "epoch": 0.724087591240876, |
| "grad_norm": 0.6210489869117737, |
| "learning_rate": 0.00018551989430579786, |
| "loss": 4.0909, |
| "step": 49600 |
| }, |
| { |
| "epoch": 0.7357664233576642, |
| "grad_norm": 0.673042356967926, |
| "learning_rate": 0.00018528631595851066, |
| "loss": 4.0867, |
| "step": 50400 |
| }, |
| { |
| "epoch": 0.7474452554744525, |
| "grad_norm": 0.5582263469696045, |
| "learning_rate": 0.00018505273761122346, |
| "loss": 4.0917, |
| "step": 51200 |
| }, |
| { |
| "epoch": 0.7591240875912408, |
| "grad_norm": 0.6824519634246826, |
| "learning_rate": 0.00018481915926393626, |
| "loss": 4.0833, |
| "step": 52000 |
| }, |
| { |
| "epoch": 0.7708029197080292, |
| "grad_norm": 0.7440599799156189, |
| "learning_rate": 0.00018458558091664904, |
| "loss": 4.0798, |
| "step": 52800 |
| }, |
| { |
| "epoch": 0.7824817518248175, |
| "grad_norm": 0.7103509902954102, |
| "learning_rate": 0.00018435200256936184, |
| "loss": 4.0903, |
| "step": 53600 |
| }, |
| { |
| "epoch": 0.7941605839416058, |
| "grad_norm": 0.9494150280952454, |
| "learning_rate": 0.0001841184242220746, |
| "loss": 4.0924, |
| "step": 54400 |
| }, |
| { |
| "epoch": 0.8058394160583942, |
| "grad_norm": 0.7308784127235413, |
| "learning_rate": 0.0001838848458747874, |
| "loss": 4.0965, |
| "step": 55200 |
| }, |
| { |
| "epoch": 0.8175182481751825, |
| "grad_norm": 0.7546706199645996, |
| "learning_rate": 0.00018365126752750021, |
| "loss": 4.0852, |
| "step": 56000 |
| }, |
| { |
| "epoch": 0.8291970802919708, |
| "grad_norm": 0.6851491928100586, |
| "learning_rate": 0.00018341768918021302, |
| "loss": 4.0805, |
| "step": 56800 |
| }, |
| { |
| "epoch": 0.8408759124087591, |
| "grad_norm": 0.6497614979743958, |
| "learning_rate": 0.0001831841108329258, |
| "loss": 4.0908, |
| "step": 57600 |
| }, |
| { |
| "epoch": 0.8525547445255475, |
| "grad_norm": 0.8901756405830383, |
| "learning_rate": 0.0001829505324856386, |
| "loss": 4.0665, |
| "step": 58400 |
| }, |
| { |
| "epoch": 0.8642335766423358, |
| "grad_norm": 0.7579403519630432, |
| "learning_rate": 0.0001827169541383514, |
| "loss": 4.0739, |
| "step": 59200 |
| }, |
| { |
| "epoch": 0.8759124087591241, |
| "grad_norm": 0.7560231685638428, |
| "learning_rate": 0.0001824833757910642, |
| "loss": 4.0696, |
| "step": 60000 |
| }, |
| { |
| "epoch": 0.8875912408759125, |
| "grad_norm": 0.9320287108421326, |
| "learning_rate": 0.00018224979744377697, |
| "loss": 4.0674, |
| "step": 60800 |
| }, |
| { |
| "epoch": 0.8992700729927007, |
| "grad_norm": 0.6470750570297241, |
| "learning_rate": 0.00018201621909648977, |
| "loss": 4.0664, |
| "step": 61600 |
| }, |
| { |
| "epoch": 0.910948905109489, |
| "grad_norm": 0.7757769227027893, |
| "learning_rate": 0.00018178264074920255, |
| "loss": 4.0676, |
| "step": 62400 |
| }, |
| { |
| "epoch": 0.9226277372262773, |
| "grad_norm": 0.9148152470588684, |
| "learning_rate": 0.00018154906240191535, |
| "loss": 4.0629, |
| "step": 63200 |
| }, |
| { |
| "epoch": 0.9343065693430657, |
| "grad_norm": 0.7893709540367126, |
| "learning_rate": 0.00018131548405462815, |
| "loss": 4.0714, |
| "step": 64000 |
| }, |
| { |
| "epoch": 0.945985401459854, |
| "grad_norm": 0.7790332436561584, |
| "learning_rate": 0.00018108190570734095, |
| "loss": 4.0618, |
| "step": 64800 |
| }, |
| { |
| "epoch": 0.9576642335766423, |
| "grad_norm": 1.025794267654419, |
| "learning_rate": 0.00018084832736005372, |
| "loss": 4.0622, |
| "step": 65600 |
| }, |
| { |
| "epoch": 0.9693430656934306, |
| "grad_norm": 0.9819601774215698, |
| "learning_rate": 0.00018061474901276653, |
| "loss": 4.056, |
| "step": 66400 |
| }, |
| { |
| "epoch": 0.981021897810219, |
| "grad_norm": 0.7966727018356323, |
| "learning_rate": 0.00018038117066547933, |
| "loss": 4.0645, |
| "step": 67200 |
| }, |
| { |
| "epoch": 0.9927007299270073, |
| "grad_norm": 0.5256748199462891, |
| "learning_rate": 0.00018014759231819213, |
| "loss": 4.0625, |
| "step": 68000 |
| }, |
| { |
| "epoch": 1.0043795620437956, |
| "grad_norm": 1.0585341453552246, |
| "learning_rate": 0.0001799140139709049, |
| "loss": 4.0299, |
| "step": 68800 |
| }, |
| { |
| "epoch": 1.0160583941605839, |
| "grad_norm": 0.568466067314148, |
| "learning_rate": 0.0001796804356236177, |
| "loss": 3.982, |
| "step": 69600 |
| }, |
| { |
| "epoch": 1.0277372262773723, |
| "grad_norm": 0.651147186756134, |
| "learning_rate": 0.00017944685727633048, |
| "loss": 3.9653, |
| "step": 70400 |
| }, |
| { |
| "epoch": 1.0394160583941605, |
| "grad_norm": 0.8875618577003479, |
| "learning_rate": 0.00017921327892904328, |
| "loss": 3.9868, |
| "step": 71200 |
| }, |
| { |
| "epoch": 1.051094890510949, |
| "grad_norm": 0.9252369999885559, |
| "learning_rate": 0.00017897970058175608, |
| "loss": 3.9729, |
| "step": 72000 |
| }, |
| { |
| "epoch": 1.0627737226277372, |
| "grad_norm": 0.5309298634529114, |
| "learning_rate": 0.00017874612223446888, |
| "loss": 3.9855, |
| "step": 72800 |
| }, |
| { |
| "epoch": 1.0744525547445256, |
| "grad_norm": 0.7743874192237854, |
| "learning_rate": 0.00017851254388718166, |
| "loss": 3.9786, |
| "step": 73600 |
| }, |
| { |
| "epoch": 1.0861313868613138, |
| "grad_norm": 0.6744789481163025, |
| "learning_rate": 0.00017827896553989446, |
| "loss": 3.9776, |
| "step": 74400 |
| }, |
| { |
| "epoch": 1.0978102189781023, |
| "grad_norm": 0.8282249569892883, |
| "learning_rate": 0.00017804538719260726, |
| "loss": 3.9781, |
| "step": 75200 |
| }, |
| { |
| "epoch": 1.1094890510948905, |
| "grad_norm": 1.0976659059524536, |
| "learning_rate": 0.00017781180884532004, |
| "loss": 3.9795, |
| "step": 76000 |
| }, |
| { |
| "epoch": 1.121167883211679, |
| "grad_norm": 0.560089647769928, |
| "learning_rate": 0.00017757823049803284, |
| "loss": 3.9812, |
| "step": 76800 |
| }, |
| { |
| "epoch": 1.1328467153284671, |
| "grad_norm": 0.9681680798530579, |
| "learning_rate": 0.00017734465215074564, |
| "loss": 3.9711, |
| "step": 77600 |
| }, |
| { |
| "epoch": 1.1445255474452556, |
| "grad_norm": 0.5735695958137512, |
| "learning_rate": 0.00017711107380345844, |
| "loss": 3.9784, |
| "step": 78400 |
| }, |
| { |
| "epoch": 1.1562043795620438, |
| "grad_norm": 0.49498119950294495, |
| "learning_rate": 0.00017687749545617121, |
| "loss": 3.9843, |
| "step": 79200 |
| }, |
| { |
| "epoch": 1.167883211678832, |
| "grad_norm": 1.0702383518218994, |
| "learning_rate": 0.00017664391710888402, |
| "loss": 3.9929, |
| "step": 80000 |
| }, |
| { |
| "epoch": 1.1795620437956205, |
| "grad_norm": 1.3075828552246094, |
| "learning_rate": 0.00017641033876159682, |
| "loss": 3.9816, |
| "step": 80800 |
| }, |
| { |
| "epoch": 1.1912408759124087, |
| "grad_norm": 1.111781120300293, |
| "learning_rate": 0.0001761767604143096, |
| "loss": 3.9792, |
| "step": 81600 |
| }, |
| { |
| "epoch": 1.2029197080291971, |
| "grad_norm": 0.9853603839874268, |
| "learning_rate": 0.0001759431820670224, |
| "loss": 3.9846, |
| "step": 82400 |
| }, |
| { |
| "epoch": 1.2145985401459853, |
| "grad_norm": 0.6544378399848938, |
| "learning_rate": 0.0001757096037197352, |
| "loss": 3.9781, |
| "step": 83200 |
| }, |
| { |
| "epoch": 1.2262773722627738, |
| "grad_norm": 1.1322039365768433, |
| "learning_rate": 0.00017547602537244797, |
| "loss": 3.9898, |
| "step": 84000 |
| }, |
| { |
| "epoch": 1.237956204379562, |
| "grad_norm": 0.7854397892951965, |
| "learning_rate": 0.00017524244702516077, |
| "loss": 3.9751, |
| "step": 84800 |
| }, |
| { |
| "epoch": 1.2496350364963504, |
| "grad_norm": 1.025718092918396, |
| "learning_rate": 0.00017500886867787357, |
| "loss": 3.9901, |
| "step": 85600 |
| }, |
| { |
| "epoch": 1.2613138686131387, |
| "grad_norm": 0.7675819993019104, |
| "learning_rate": 0.00017477529033058637, |
| "loss": 3.984, |
| "step": 86400 |
| }, |
| { |
| "epoch": 1.2729927007299269, |
| "grad_norm": 0.8988509774208069, |
| "learning_rate": 0.00017454171198329915, |
| "loss": 3.9918, |
| "step": 87200 |
| }, |
| { |
| "epoch": 1.2846715328467153, |
| "grad_norm": 1.1059536933898926, |
| "learning_rate": 0.00017430813363601195, |
| "loss": 3.9813, |
| "step": 88000 |
| }, |
| { |
| "epoch": 1.2963503649635038, |
| "grad_norm": 0.5204365253448486, |
| "learning_rate": 0.00017407455528872475, |
| "loss": 3.9744, |
| "step": 88800 |
| }, |
| { |
| "epoch": 1.308029197080292, |
| "grad_norm": 0.9890186786651611, |
| "learning_rate": 0.00017384097694143753, |
| "loss": 3.9834, |
| "step": 89600 |
| }, |
| { |
| "epoch": 1.3197080291970802, |
| "grad_norm": 1.2235816717147827, |
| "learning_rate": 0.00017360739859415033, |
| "loss": 3.993, |
| "step": 90400 |
| }, |
| { |
| "epoch": 1.3313868613138686, |
| "grad_norm": 1.0497245788574219, |
| "learning_rate": 0.00017337382024686313, |
| "loss": 3.9901, |
| "step": 91200 |
| }, |
| { |
| "epoch": 1.343065693430657, |
| "grad_norm": 0.926929235458374, |
| "learning_rate": 0.0001731402418995759, |
| "loss": 3.9742, |
| "step": 92000 |
| }, |
| { |
| "epoch": 1.3547445255474453, |
| "grad_norm": 0.7672074437141418, |
| "learning_rate": 0.0001729066635522887, |
| "loss": 3.9956, |
| "step": 92800 |
| }, |
| { |
| "epoch": 1.3664233576642335, |
| "grad_norm": 0.5817465782165527, |
| "learning_rate": 0.0001726730852050015, |
| "loss": 3.9828, |
| "step": 93600 |
| }, |
| { |
| "epoch": 1.378102189781022, |
| "grad_norm": 0.7459368109703064, |
| "learning_rate": 0.0001724395068577143, |
| "loss": 3.983, |
| "step": 94400 |
| }, |
| { |
| "epoch": 1.3897810218978102, |
| "grad_norm": 0.9725570678710938, |
| "learning_rate": 0.00017220592851042708, |
| "loss": 3.9791, |
| "step": 95200 |
| }, |
| { |
| "epoch": 1.4014598540145986, |
| "grad_norm": 1.300221562385559, |
| "learning_rate": 0.00017197235016313988, |
| "loss": 3.986, |
| "step": 96000 |
| }, |
| { |
| "epoch": 1.4131386861313868, |
| "grad_norm": 0.6552464962005615, |
| "learning_rate": 0.00017173877181585269, |
| "loss": 3.9816, |
| "step": 96800 |
| }, |
| { |
| "epoch": 1.4248175182481753, |
| "grad_norm": 1.0207213163375854, |
| "learning_rate": 0.00017150519346856546, |
| "loss": 3.9733, |
| "step": 97600 |
| }, |
| { |
| "epoch": 1.4364963503649635, |
| "grad_norm": 0.9970253109931946, |
| "learning_rate": 0.00017127161512127826, |
| "loss": 3.9834, |
| "step": 98400 |
| }, |
| { |
| "epoch": 1.448175182481752, |
| "grad_norm": 0.908315896987915, |
| "learning_rate": 0.00017103803677399106, |
| "loss": 3.9877, |
| "step": 99200 |
| }, |
| { |
| "epoch": 1.4598540145985401, |
| "grad_norm": 0.9726221561431885, |
| "learning_rate": 0.00017080445842670384, |
| "loss": 3.977, |
| "step": 100000 |
| }, |
| { |
| "epoch": 1.4715328467153284, |
| "grad_norm": 0.7048055529594421, |
| "learning_rate": 0.00017057088007941664, |
| "loss": 3.986, |
| "step": 100800 |
| }, |
| { |
| "epoch": 1.4832116788321168, |
| "grad_norm": 0.5860503911972046, |
| "learning_rate": 0.00017033730173212944, |
| "loss": 3.9723, |
| "step": 101600 |
| }, |
| { |
| "epoch": 1.4948905109489052, |
| "grad_norm": 1.0115162134170532, |
| "learning_rate": 0.00017010372338484224, |
| "loss": 3.9777, |
| "step": 102400 |
| }, |
| { |
| "epoch": 1.5065693430656935, |
| "grad_norm": 0.9691118597984314, |
| "learning_rate": 0.00016987014503755502, |
| "loss": 3.9749, |
| "step": 103200 |
| }, |
| { |
| "epoch": 1.5182481751824817, |
| "grad_norm": 0.7307090759277344, |
| "learning_rate": 0.00016963656669026782, |
| "loss": 3.98, |
| "step": 104000 |
| }, |
| { |
| "epoch": 1.5299270072992701, |
| "grad_norm": 0.9633815288543701, |
| "learning_rate": 0.00016940298834298062, |
| "loss": 3.9814, |
| "step": 104800 |
| }, |
| { |
| "epoch": 1.5416058394160586, |
| "grad_norm": 0.8716799020767212, |
| "learning_rate": 0.00016916940999569342, |
| "loss": 3.9799, |
| "step": 105600 |
| }, |
| { |
| "epoch": 1.5532846715328468, |
| "grad_norm": 1.063167691230774, |
| "learning_rate": 0.0001689358316484062, |
| "loss": 3.9828, |
| "step": 106400 |
| }, |
| { |
| "epoch": 1.564963503649635, |
| "grad_norm": 0.9315568804740906, |
| "learning_rate": 0.000168702253301119, |
| "loss": 3.9801, |
| "step": 107200 |
| }, |
| { |
| "epoch": 1.5766423357664232, |
| "grad_norm": 1.408599853515625, |
| "learning_rate": 0.00016846867495383177, |
| "loss": 3.9741, |
| "step": 108000 |
| }, |
| { |
| "epoch": 1.5883211678832116, |
| "grad_norm": 0.958906352519989, |
| "learning_rate": 0.00016823509660654457, |
| "loss": 3.9745, |
| "step": 108800 |
| }, |
| { |
| "epoch": 1.6, |
| "grad_norm": 1.124635934829712, |
| "learning_rate": 0.00016800151825925737, |
| "loss": 3.9809, |
| "step": 109600 |
| }, |
| { |
| "epoch": 1.6116788321167883, |
| "grad_norm": 0.8791921138763428, |
| "learning_rate": 0.00016776793991197018, |
| "loss": 3.9703, |
| "step": 110400 |
| }, |
| { |
| "epoch": 1.6233576642335765, |
| "grad_norm": 0.6150586009025574, |
| "learning_rate": 0.00016753436156468295, |
| "loss": 3.9944, |
| "step": 111200 |
| }, |
| { |
| "epoch": 1.635036496350365, |
| "grad_norm": 0.9286842942237854, |
| "learning_rate": 0.00016730078321739575, |
| "loss": 3.972, |
| "step": 112000 |
| }, |
| { |
| "epoch": 1.6467153284671534, |
| "grad_norm": 1.3587080240249634, |
| "learning_rate": 0.00016706720487010855, |
| "loss": 3.9649, |
| "step": 112800 |
| }, |
| { |
| "epoch": 1.6583941605839416, |
| "grad_norm": 1.1084531545639038, |
| "learning_rate": 0.00016683362652282136, |
| "loss": 3.9833, |
| "step": 113600 |
| }, |
| { |
| "epoch": 1.6700729927007298, |
| "grad_norm": 0.8050103783607483, |
| "learning_rate": 0.00016660004817553413, |
| "loss": 3.9743, |
| "step": 114400 |
| }, |
| { |
| "epoch": 1.6817518248175183, |
| "grad_norm": 0.8709130883216858, |
| "learning_rate": 0.00016636646982824693, |
| "loss": 3.9626, |
| "step": 115200 |
| }, |
| { |
| "epoch": 1.6934306569343067, |
| "grad_norm": 0.6005585789680481, |
| "learning_rate": 0.0001661328914809597, |
| "loss": 3.9638, |
| "step": 116000 |
| }, |
| { |
| "epoch": 1.705109489051095, |
| "grad_norm": 1.2463181018829346, |
| "learning_rate": 0.0001658993131336725, |
| "loss": 3.9788, |
| "step": 116800 |
| }, |
| { |
| "epoch": 1.7167883211678832, |
| "grad_norm": 1.1946378946304321, |
| "learning_rate": 0.0001656657347863853, |
| "loss": 3.9697, |
| "step": 117600 |
| }, |
| { |
| "epoch": 1.7284671532846714, |
| "grad_norm": 0.7879184484481812, |
| "learning_rate": 0.0001654321564390981, |
| "loss": 3.9621, |
| "step": 118400 |
| }, |
| { |
| "epoch": 1.7401459854014598, |
| "grad_norm": 1.1674267053604126, |
| "learning_rate": 0.00016519857809181088, |
| "loss": 3.9646, |
| "step": 119200 |
| }, |
| { |
| "epoch": 1.7518248175182483, |
| "grad_norm": 0.980387270450592, |
| "learning_rate": 0.00016496499974452369, |
| "loss": 3.972, |
| "step": 120000 |
| }, |
| { |
| "epoch": 1.7635036496350365, |
| "grad_norm": 0.9502540826797485, |
| "learning_rate": 0.0001647314213972365, |
| "loss": 3.9702, |
| "step": 120800 |
| }, |
| { |
| "epoch": 1.7751824817518247, |
| "grad_norm": 0.6683688759803772, |
| "learning_rate": 0.0001644978430499493, |
| "loss": 3.9704, |
| "step": 121600 |
| }, |
| { |
| "epoch": 1.7868613138686131, |
| "grad_norm": 1.1526950597763062, |
| "learning_rate": 0.00016426426470266206, |
| "loss": 3.9644, |
| "step": 122400 |
| }, |
| { |
| "epoch": 1.7985401459854016, |
| "grad_norm": 0.8221763968467712, |
| "learning_rate": 0.00016403068635537486, |
| "loss": 3.9677, |
| "step": 123200 |
| }, |
| { |
| "epoch": 1.8102189781021898, |
| "grad_norm": 1.111382246017456, |
| "learning_rate": 0.00016379710800808764, |
| "loss": 3.9787, |
| "step": 124000 |
| }, |
| { |
| "epoch": 1.821897810218978, |
| "grad_norm": 1.3993886709213257, |
| "learning_rate": 0.00016356352966080044, |
| "loss": 3.9848, |
| "step": 124800 |
| }, |
| { |
| "epoch": 1.8335766423357664, |
| "grad_norm": 0.9269404411315918, |
| "learning_rate": 0.00016332995131351324, |
| "loss": 3.9477, |
| "step": 125600 |
| }, |
| { |
| "epoch": 1.845255474452555, |
| "grad_norm": 0.9304395318031311, |
| "learning_rate": 0.00016309637296622604, |
| "loss": 3.9742, |
| "step": 126400 |
| }, |
| { |
| "epoch": 1.856934306569343, |
| "grad_norm": 1.1939619779586792, |
| "learning_rate": 0.00016286279461893882, |
| "loss": 3.9732, |
| "step": 127200 |
| }, |
| { |
| "epoch": 1.8686131386861313, |
| "grad_norm": 0.963022768497467, |
| "learning_rate": 0.00016262921627165162, |
| "loss": 3.9625, |
| "step": 128000 |
| }, |
| { |
| "epoch": 1.8802919708029195, |
| "grad_norm": 0.8013544082641602, |
| "learning_rate": 0.00016239563792436442, |
| "loss": 3.9648, |
| "step": 128800 |
| }, |
| { |
| "epoch": 1.891970802919708, |
| "grad_norm": 1.1415009498596191, |
| "learning_rate": 0.00016216205957707722, |
| "loss": 3.9715, |
| "step": 129600 |
| }, |
| { |
| "epoch": 1.9036496350364964, |
| "grad_norm": 0.9819127917289734, |
| "learning_rate": 0.00016192848122979, |
| "loss": 3.9624, |
| "step": 130400 |
| }, |
| { |
| "epoch": 1.9153284671532846, |
| "grad_norm": 0.7112650871276855, |
| "learning_rate": 0.0001616949028825028, |
| "loss": 3.9607, |
| "step": 131200 |
| }, |
| { |
| "epoch": 1.9270072992700729, |
| "grad_norm": 0.7825914025306702, |
| "learning_rate": 0.00016146132453521557, |
| "loss": 3.9492, |
| "step": 132000 |
| }, |
| { |
| "epoch": 1.9386861313868613, |
| "grad_norm": 1.136427402496338, |
| "learning_rate": 0.0001612277461879284, |
| "loss": 3.9607, |
| "step": 132800 |
| }, |
| { |
| "epoch": 1.9503649635036497, |
| "grad_norm": 1.004979133605957, |
| "learning_rate": 0.00016099416784064118, |
| "loss": 3.9734, |
| "step": 133600 |
| }, |
| { |
| "epoch": 1.962043795620438, |
| "grad_norm": 1.0205179452896118, |
| "learning_rate": 0.00016076058949335398, |
| "loss": 3.9629, |
| "step": 134400 |
| }, |
| { |
| "epoch": 1.9737226277372262, |
| "grad_norm": 0.6552355885505676, |
| "learning_rate": 0.00016052701114606675, |
| "loss": 3.9705, |
| "step": 135200 |
| }, |
| { |
| "epoch": 1.9854014598540146, |
| "grad_norm": 0.9943566918373108, |
| "learning_rate": 0.00016029343279877955, |
| "loss": 3.9582, |
| "step": 136000 |
| }, |
| { |
| "epoch": 1.997080291970803, |
| "grad_norm": 1.0246118307113647, |
| "learning_rate": 0.00016005985445149236, |
| "loss": 3.9668, |
| "step": 136800 |
| }, |
| { |
| "epoch": 2.0087591240875913, |
| "grad_norm": 0.783827006816864, |
| "learning_rate": 0.00015982627610420516, |
| "loss": 3.881, |
| "step": 137600 |
| }, |
| { |
| "epoch": 2.0204379562043795, |
| "grad_norm": 0.7887817025184631, |
| "learning_rate": 0.00015959269775691793, |
| "loss": 3.8402, |
| "step": 138400 |
| }, |
| { |
| "epoch": 2.0321167883211677, |
| "grad_norm": 0.9230135679244995, |
| "learning_rate": 0.00015935911940963073, |
| "loss": 3.8429, |
| "step": 139200 |
| }, |
| { |
| "epoch": 2.0437956204379564, |
| "grad_norm": 0.9154564738273621, |
| "learning_rate": 0.0001591255410623435, |
| "loss": 3.8557, |
| "step": 140000 |
| }, |
| { |
| "epoch": 2.0554744525547446, |
| "grad_norm": 1.6573781967163086, |
| "learning_rate": 0.00015889196271505634, |
| "loss": 3.8501, |
| "step": 140800 |
| }, |
| { |
| "epoch": 2.067153284671533, |
| "grad_norm": 0.9853724241256714, |
| "learning_rate": 0.0001586583843677691, |
| "loss": 3.847, |
| "step": 141600 |
| }, |
| { |
| "epoch": 2.078832116788321, |
| "grad_norm": 1.1365453004837036, |
| "learning_rate": 0.0001584248060204819, |
| "loss": 3.8616, |
| "step": 142400 |
| }, |
| { |
| "epoch": 2.0905109489051097, |
| "grad_norm": 1.0716379880905151, |
| "learning_rate": 0.00015819122767319469, |
| "loss": 3.8607, |
| "step": 143200 |
| }, |
| { |
| "epoch": 2.102189781021898, |
| "grad_norm": 0.862193763256073, |
| "learning_rate": 0.0001579576493259075, |
| "loss": 3.8566, |
| "step": 144000 |
| }, |
| { |
| "epoch": 2.113868613138686, |
| "grad_norm": 0.6875022649765015, |
| "learning_rate": 0.0001577240709786203, |
| "loss": 3.8737, |
| "step": 144800 |
| }, |
| { |
| "epoch": 2.1255474452554743, |
| "grad_norm": 0.7993234992027283, |
| "learning_rate": 0.0001574904926313331, |
| "loss": 3.8699, |
| "step": 145600 |
| }, |
| { |
| "epoch": 2.137226277372263, |
| "grad_norm": 1.0347297191619873, |
| "learning_rate": 0.00015725691428404587, |
| "loss": 3.8774, |
| "step": 146400 |
| }, |
| { |
| "epoch": 2.1489051094890512, |
| "grad_norm": 0.8707027435302734, |
| "learning_rate": 0.00015702333593675867, |
| "loss": 3.8687, |
| "step": 147200 |
| }, |
| { |
| "epoch": 2.1605839416058394, |
| "grad_norm": 1.2627824544906616, |
| "learning_rate": 0.00015678975758947144, |
| "loss": 3.8672, |
| "step": 148000 |
| }, |
| { |
| "epoch": 2.1722627737226277, |
| "grad_norm": 1.1346710920333862, |
| "learning_rate": 0.00015655617924218427, |
| "loss": 3.8797, |
| "step": 148800 |
| }, |
| { |
| "epoch": 2.183941605839416, |
| "grad_norm": 1.2239959239959717, |
| "learning_rate": 0.00015632260089489704, |
| "loss": 3.8604, |
| "step": 149600 |
| }, |
| { |
| "epoch": 2.1956204379562045, |
| "grad_norm": 1.1624715328216553, |
| "learning_rate": 0.00015608902254760985, |
| "loss": 3.8678, |
| "step": 150400 |
| }, |
| { |
| "epoch": 2.2072992700729928, |
| "grad_norm": 0.9525280594825745, |
| "learning_rate": 0.00015585544420032262, |
| "loss": 3.8617, |
| "step": 151200 |
| }, |
| { |
| "epoch": 2.218978102189781, |
| "grad_norm": 0.6676674485206604, |
| "learning_rate": 0.00015562186585303542, |
| "loss": 3.8645, |
| "step": 152000 |
| }, |
| { |
| "epoch": 2.230656934306569, |
| "grad_norm": 1.4291656017303467, |
| "learning_rate": 0.00015538828750574822, |
| "loss": 3.8721, |
| "step": 152800 |
| }, |
| { |
| "epoch": 2.242335766423358, |
| "grad_norm": 2.071485996246338, |
| "learning_rate": 0.00015515470915846102, |
| "loss": 3.881, |
| "step": 153600 |
| }, |
| { |
| "epoch": 2.254014598540146, |
| "grad_norm": 1.3130428791046143, |
| "learning_rate": 0.0001549211308111738, |
| "loss": 3.8798, |
| "step": 154400 |
| }, |
| { |
| "epoch": 2.2656934306569343, |
| "grad_norm": 1.0672556161880493, |
| "learning_rate": 0.0001546875524638866, |
| "loss": 3.8767, |
| "step": 155200 |
| }, |
| { |
| "epoch": 2.2773722627737225, |
| "grad_norm": 0.8703996539115906, |
| "learning_rate": 0.0001544539741165994, |
| "loss": 3.8696, |
| "step": 156000 |
| }, |
| { |
| "epoch": 2.289051094890511, |
| "grad_norm": 1.0338706970214844, |
| "learning_rate": 0.0001542203957693122, |
| "loss": 3.8857, |
| "step": 156800 |
| }, |
| { |
| "epoch": 2.3007299270072994, |
| "grad_norm": 0.9246997833251953, |
| "learning_rate": 0.00015398681742202498, |
| "loss": 3.884, |
| "step": 157600 |
| }, |
| { |
| "epoch": 2.3124087591240876, |
| "grad_norm": 0.7899117469787598, |
| "learning_rate": 0.00015375323907473778, |
| "loss": 3.8891, |
| "step": 158400 |
| }, |
| { |
| "epoch": 2.324087591240876, |
| "grad_norm": 0.545261800289154, |
| "learning_rate": 0.00015351966072745055, |
| "loss": 3.8702, |
| "step": 159200 |
| }, |
| { |
| "epoch": 2.335766423357664, |
| "grad_norm": 0.6720581650733948, |
| "learning_rate": 0.00015328608238016338, |
| "loss": 3.8962, |
| "step": 160000 |
| }, |
| { |
| "epoch": 2.3474452554744527, |
| "grad_norm": 0.7954906821250916, |
| "learning_rate": 0.00015305250403287616, |
| "loss": 3.8752, |
| "step": 160800 |
| }, |
| { |
| "epoch": 2.359124087591241, |
| "grad_norm": 0.6123435497283936, |
| "learning_rate": 0.00015281892568558896, |
| "loss": 3.8838, |
| "step": 161600 |
| }, |
| { |
| "epoch": 2.370802919708029, |
| "grad_norm": 1.0467143058776855, |
| "learning_rate": 0.00015258534733830173, |
| "loss": 3.8798, |
| "step": 162400 |
| }, |
| { |
| "epoch": 2.3824817518248174, |
| "grad_norm": 0.9251344799995422, |
| "learning_rate": 0.00015235176899101453, |
| "loss": 3.8801, |
| "step": 163200 |
| }, |
| { |
| "epoch": 2.394160583941606, |
| "grad_norm": 1.2078486680984497, |
| "learning_rate": 0.00015211819064372734, |
| "loss": 3.8815, |
| "step": 164000 |
| }, |
| { |
| "epoch": 2.4058394160583942, |
| "grad_norm": 1.1758290529251099, |
| "learning_rate": 0.00015188461229644014, |
| "loss": 3.8854, |
| "step": 164800 |
| }, |
| { |
| "epoch": 2.4175182481751825, |
| "grad_norm": 1.2341893911361694, |
| "learning_rate": 0.0001516510339491529, |
| "loss": 3.8736, |
| "step": 165600 |
| }, |
| { |
| "epoch": 2.4291970802919707, |
| "grad_norm": 0.85239177942276, |
| "learning_rate": 0.00015141745560186571, |
| "loss": 3.8821, |
| "step": 166400 |
| }, |
| { |
| "epoch": 2.4408759124087593, |
| "grad_norm": 0.7495572566986084, |
| "learning_rate": 0.0001511838772545785, |
| "loss": 3.8852, |
| "step": 167200 |
| }, |
| { |
| "epoch": 2.4525547445255476, |
| "grad_norm": 0.9310500621795654, |
| "learning_rate": 0.00015095029890729132, |
| "loss": 3.8904, |
| "step": 168000 |
| }, |
| { |
| "epoch": 2.4642335766423358, |
| "grad_norm": 1.059832215309143, |
| "learning_rate": 0.0001507167205600041, |
| "loss": 3.8716, |
| "step": 168800 |
| }, |
| { |
| "epoch": 2.475912408759124, |
| "grad_norm": 1.6281853914260864, |
| "learning_rate": 0.0001504831422127169, |
| "loss": 3.8841, |
| "step": 169600 |
| }, |
| { |
| "epoch": 2.487591240875912, |
| "grad_norm": 0.7431134581565857, |
| "learning_rate": 0.00015024956386542967, |
| "loss": 3.8971, |
| "step": 170400 |
| }, |
| { |
| "epoch": 2.499270072992701, |
| "grad_norm": 1.9159060716629028, |
| "learning_rate": 0.00015001598551814247, |
| "loss": 3.8938, |
| "step": 171200 |
| }, |
| { |
| "epoch": 2.510948905109489, |
| "grad_norm": 0.8651421070098877, |
| "learning_rate": 0.00014978240717085527, |
| "loss": 3.8959, |
| "step": 172000 |
| }, |
| { |
| "epoch": 2.5226277372262773, |
| "grad_norm": 0.6994857788085938, |
| "learning_rate": 0.00014954882882356807, |
| "loss": 3.9002, |
| "step": 172800 |
| }, |
| { |
| "epoch": 2.5343065693430655, |
| "grad_norm": 0.9615395069122314, |
| "learning_rate": 0.00014931525047628085, |
| "loss": 3.9101, |
| "step": 173600 |
| }, |
| { |
| "epoch": 2.5459854014598537, |
| "grad_norm": 1.0740983486175537, |
| "learning_rate": 0.00014908167212899365, |
| "loss": 3.8918, |
| "step": 174400 |
| }, |
| { |
| "epoch": 2.5576642335766424, |
| "grad_norm": 0.5677556991577148, |
| "learning_rate": 0.00014884809378170642, |
| "loss": 3.8898, |
| "step": 175200 |
| }, |
| { |
| "epoch": 2.5693430656934306, |
| "grad_norm": 0.8451770544052124, |
| "learning_rate": 0.00014861451543441925, |
| "loss": 3.8997, |
| "step": 176000 |
| }, |
| { |
| "epoch": 2.581021897810219, |
| "grad_norm": 0.9004770517349243, |
| "learning_rate": 0.00014838093708713202, |
| "loss": 3.8918, |
| "step": 176800 |
| }, |
| { |
| "epoch": 2.5927007299270075, |
| "grad_norm": 1.2636882066726685, |
| "learning_rate": 0.00014814735873984483, |
| "loss": 3.8952, |
| "step": 177600 |
| }, |
| { |
| "epoch": 2.6043795620437957, |
| "grad_norm": 1.2926832437515259, |
| "learning_rate": 0.0001479137803925576, |
| "loss": 3.8955, |
| "step": 178400 |
| }, |
| { |
| "epoch": 2.616058394160584, |
| "grad_norm": 1.2102513313293457, |
| "learning_rate": 0.00014768020204527043, |
| "loss": 3.883, |
| "step": 179200 |
| }, |
| { |
| "epoch": 2.627737226277372, |
| "grad_norm": 0.8510094285011292, |
| "learning_rate": 0.0001474466236979832, |
| "loss": 3.8857, |
| "step": 180000 |
| }, |
| { |
| "epoch": 2.6394160583941604, |
| "grad_norm": 1.0621333122253418, |
| "learning_rate": 0.000147213045350696, |
| "loss": 3.8913, |
| "step": 180800 |
| }, |
| { |
| "epoch": 2.651094890510949, |
| "grad_norm": 0.6437257528305054, |
| "learning_rate": 0.00014697946700340878, |
| "loss": 3.8867, |
| "step": 181600 |
| }, |
| { |
| "epoch": 2.6627737226277373, |
| "grad_norm": 1.1535508632659912, |
| "learning_rate": 0.00014674588865612158, |
| "loss": 3.8867, |
| "step": 182400 |
| }, |
| { |
| "epoch": 2.6744525547445255, |
| "grad_norm": 0.8997545838356018, |
| "learning_rate": 0.00014651231030883438, |
| "loss": 3.8981, |
| "step": 183200 |
| }, |
| { |
| "epoch": 2.686131386861314, |
| "grad_norm": 1.1245380640029907, |
| "learning_rate": 0.00014627873196154718, |
| "loss": 3.889, |
| "step": 184000 |
| }, |
| { |
| "epoch": 2.697810218978102, |
| "grad_norm": 1.0414170026779175, |
| "learning_rate": 0.00014604515361425996, |
| "loss": 3.8838, |
| "step": 184800 |
| }, |
| { |
| "epoch": 2.7094890510948906, |
| "grad_norm": 0.8895809054374695, |
| "learning_rate": 0.00014581157526697276, |
| "loss": 3.898, |
| "step": 185600 |
| }, |
| { |
| "epoch": 2.721167883211679, |
| "grad_norm": 0.9932270050048828, |
| "learning_rate": 0.00014557799691968553, |
| "loss": 3.8895, |
| "step": 186400 |
| }, |
| { |
| "epoch": 2.732846715328467, |
| "grad_norm": 0.9618707895278931, |
| "learning_rate": 0.00014534441857239836, |
| "loss": 3.8977, |
| "step": 187200 |
| }, |
| { |
| "epoch": 2.7445255474452557, |
| "grad_norm": 0.6126194000244141, |
| "learning_rate": 0.00014511084022511114, |
| "loss": 3.8931, |
| "step": 188000 |
| }, |
| { |
| "epoch": 2.756204379562044, |
| "grad_norm": 0.9863024353981018, |
| "learning_rate": 0.00014487726187782394, |
| "loss": 3.883, |
| "step": 188800 |
| }, |
| { |
| "epoch": 2.767883211678832, |
| "grad_norm": 0.9130956530570984, |
| "learning_rate": 0.00014464368353053671, |
| "loss": 3.8944, |
| "step": 189600 |
| }, |
| { |
| "epoch": 2.7795620437956203, |
| "grad_norm": 1.2602386474609375, |
| "learning_rate": 0.00014441010518324952, |
| "loss": 3.8864, |
| "step": 190400 |
| }, |
| { |
| "epoch": 2.7912408759124085, |
| "grad_norm": 0.5719624161720276, |
| "learning_rate": 0.00014417652683596232, |
| "loss": 3.8885, |
| "step": 191200 |
| }, |
| { |
| "epoch": 2.802919708029197, |
| "grad_norm": 0.846307635307312, |
| "learning_rate": 0.00014394294848867512, |
| "loss": 3.9102, |
| "step": 192000 |
| }, |
| { |
| "epoch": 2.8145985401459854, |
| "grad_norm": 1.140491247177124, |
| "learning_rate": 0.0001437093701413879, |
| "loss": 3.9015, |
| "step": 192800 |
| }, |
| { |
| "epoch": 2.8262773722627736, |
| "grad_norm": 0.8926886320114136, |
| "learning_rate": 0.0001434757917941007, |
| "loss": 3.8829, |
| "step": 193600 |
| }, |
| { |
| "epoch": 2.8379562043795623, |
| "grad_norm": 0.9810717105865479, |
| "learning_rate": 0.00014324221344681347, |
| "loss": 3.8978, |
| "step": 194400 |
| }, |
| { |
| "epoch": 2.8496350364963505, |
| "grad_norm": 1.0074169635772705, |
| "learning_rate": 0.0001430086350995263, |
| "loss": 3.8919, |
| "step": 195200 |
| }, |
| { |
| "epoch": 2.8613138686131387, |
| "grad_norm": 0.9317566156387329, |
| "learning_rate": 0.00014277505675223907, |
| "loss": 3.9084, |
| "step": 196000 |
| }, |
| { |
| "epoch": 2.872992700729927, |
| "grad_norm": 1.5248185396194458, |
| "learning_rate": 0.00014254147840495187, |
| "loss": 3.8973, |
| "step": 196800 |
| }, |
| { |
| "epoch": 2.884671532846715, |
| "grad_norm": 0.8181623816490173, |
| "learning_rate": 0.00014230790005766465, |
| "loss": 3.8989, |
| "step": 197600 |
| }, |
| { |
| "epoch": 2.896350364963504, |
| "grad_norm": 1.0604304075241089, |
| "learning_rate": 0.00014207432171037745, |
| "loss": 3.8824, |
| "step": 198400 |
| }, |
| { |
| "epoch": 2.908029197080292, |
| "grad_norm": 1.1073662042617798, |
| "learning_rate": 0.00014184074336309025, |
| "loss": 3.883, |
| "step": 199200 |
| }, |
| { |
| "epoch": 2.9197080291970803, |
| "grad_norm": 1.4300545454025269, |
| "learning_rate": 0.00014160716501580305, |
| "loss": 3.8935, |
| "step": 200000 |
| }, |
| { |
| "epoch": 2.9313868613138685, |
| "grad_norm": 0.9314271807670593, |
| "learning_rate": 0.00014137358666851583, |
| "loss": 3.8885, |
| "step": 200800 |
| }, |
| { |
| "epoch": 2.9430656934306567, |
| "grad_norm": 1.5918676853179932, |
| "learning_rate": 0.00014114000832122863, |
| "loss": 3.8834, |
| "step": 201600 |
| }, |
| { |
| "epoch": 2.9547445255474454, |
| "grad_norm": 0.9951960444450378, |
| "learning_rate": 0.0001409064299739414, |
| "loss": 3.8937, |
| "step": 202400 |
| }, |
| { |
| "epoch": 2.9664233576642336, |
| "grad_norm": 1.1108194589614868, |
| "learning_rate": 0.00014067285162665423, |
| "loss": 3.8979, |
| "step": 203200 |
| }, |
| { |
| "epoch": 2.978102189781022, |
| "grad_norm": 1.1282997131347656, |
| "learning_rate": 0.000140439273279367, |
| "loss": 3.8962, |
| "step": 204000 |
| }, |
| { |
| "epoch": 2.9897810218978105, |
| "grad_norm": 1.1506567001342773, |
| "learning_rate": 0.0001402056949320798, |
| "loss": 3.8997, |
| "step": 204800 |
| }, |
| { |
| "epoch": 3.0014598540145987, |
| "grad_norm": 1.0932163000106812, |
| "learning_rate": 0.00013997211658479258, |
| "loss": 3.8628, |
| "step": 205600 |
| }, |
| { |
| "epoch": 3.013138686131387, |
| "grad_norm": 1.4452706575393677, |
| "learning_rate": 0.0001397385382375054, |
| "loss": 3.7579, |
| "step": 206400 |
| }, |
| { |
| "epoch": 3.024817518248175, |
| "grad_norm": 1.5414268970489502, |
| "learning_rate": 0.00013950495989021818, |
| "loss": 3.7585, |
| "step": 207200 |
| }, |
| { |
| "epoch": 3.0364963503649633, |
| "grad_norm": 0.8187559843063354, |
| "learning_rate": 0.00013927138154293099, |
| "loss": 3.7596, |
| "step": 208000 |
| }, |
| { |
| "epoch": 3.048175182481752, |
| "grad_norm": 1.4612154960632324, |
| "learning_rate": 0.00013903780319564376, |
| "loss": 3.7583, |
| "step": 208800 |
| }, |
| { |
| "epoch": 3.0598540145985402, |
| "grad_norm": 0.7291022539138794, |
| "learning_rate": 0.00013880422484835656, |
| "loss": 3.753, |
| "step": 209600 |
| }, |
| { |
| "epoch": 3.0715328467153284, |
| "grad_norm": 0.9759907126426697, |
| "learning_rate": 0.00013857064650106936, |
| "loss": 3.7531, |
| "step": 210400 |
| }, |
| { |
| "epoch": 3.0832116788321167, |
| "grad_norm": 0.8981759548187256, |
| "learning_rate": 0.00013833706815378217, |
| "loss": 3.7598, |
| "step": 211200 |
| }, |
| { |
| "epoch": 3.094890510948905, |
| "grad_norm": 0.9674969911575317, |
| "learning_rate": 0.00013810348980649494, |
| "loss": 3.7593, |
| "step": 212000 |
| }, |
| { |
| "epoch": 3.1065693430656935, |
| "grad_norm": 1.410812497138977, |
| "learning_rate": 0.00013786991145920774, |
| "loss": 3.7674, |
| "step": 212800 |
| }, |
| { |
| "epoch": 3.1182481751824818, |
| "grad_norm": 1.0926766395568848, |
| "learning_rate": 0.00013763633311192052, |
| "loss": 3.7617, |
| "step": 213600 |
| }, |
| { |
| "epoch": 3.12992700729927, |
| "grad_norm": 0.9864111542701721, |
| "learning_rate": 0.00013740275476463334, |
| "loss": 3.7752, |
| "step": 214400 |
| }, |
| { |
| "epoch": 3.141605839416058, |
| "grad_norm": 0.8889511823654175, |
| "learning_rate": 0.00013716917641734612, |
| "loss": 3.7727, |
| "step": 215200 |
| }, |
| { |
| "epoch": 3.153284671532847, |
| "grad_norm": 0.8842699527740479, |
| "learning_rate": 0.00013693559807005892, |
| "loss": 3.7694, |
| "step": 216000 |
| }, |
| { |
| "epoch": 3.164963503649635, |
| "grad_norm": 1.7299695014953613, |
| "learning_rate": 0.0001367020197227717, |
| "loss": 3.7738, |
| "step": 216800 |
| }, |
| { |
| "epoch": 3.1766423357664233, |
| "grad_norm": 0.9839622974395752, |
| "learning_rate": 0.0001364684413754845, |
| "loss": 3.7947, |
| "step": 217600 |
| }, |
| { |
| "epoch": 3.1883211678832115, |
| "grad_norm": 1.3334280252456665, |
| "learning_rate": 0.0001362348630281973, |
| "loss": 3.7778, |
| "step": 218400 |
| }, |
| { |
| "epoch": 3.2, |
| "grad_norm": 1.234206199645996, |
| "learning_rate": 0.0001360012846809101, |
| "loss": 3.7756, |
| "step": 219200 |
| }, |
| { |
| "epoch": 3.2116788321167884, |
| "grad_norm": 0.7193965315818787, |
| "learning_rate": 0.00013576770633362287, |
| "loss": 3.7879, |
| "step": 220000 |
| }, |
| { |
| "epoch": 3.2233576642335766, |
| "grad_norm": 1.1315131187438965, |
| "learning_rate": 0.00013553412798633568, |
| "loss": 3.7775, |
| "step": 220800 |
| }, |
| { |
| "epoch": 3.235036496350365, |
| "grad_norm": 1.6959398984909058, |
| "learning_rate": 0.00013530054963904845, |
| "loss": 3.7813, |
| "step": 221600 |
| }, |
| { |
| "epoch": 3.246715328467153, |
| "grad_norm": 2.144179344177246, |
| "learning_rate": 0.00013506697129176128, |
| "loss": 3.7948, |
| "step": 222400 |
| }, |
| { |
| "epoch": 3.2583941605839417, |
| "grad_norm": 1.4156116247177124, |
| "learning_rate": 0.00013483339294447405, |
| "loss": 3.7894, |
| "step": 223200 |
| }, |
| { |
| "epoch": 3.27007299270073, |
| "grad_norm": 0.8479212522506714, |
| "learning_rate": 0.00013459981459718685, |
| "loss": 3.8035, |
| "step": 224000 |
| }, |
| { |
| "epoch": 3.281751824817518, |
| "grad_norm": 0.8472751975059509, |
| "learning_rate": 0.00013436623624989963, |
| "loss": 3.8009, |
| "step": 224800 |
| }, |
| { |
| "epoch": 3.293430656934307, |
| "grad_norm": 1.2888227701187134, |
| "learning_rate": 0.00013413265790261243, |
| "loss": 3.7939, |
| "step": 225600 |
| }, |
| { |
| "epoch": 3.305109489051095, |
| "grad_norm": 1.1597789525985718, |
| "learning_rate": 0.00013389907955532523, |
| "loss": 3.7926, |
| "step": 226400 |
| }, |
| { |
| "epoch": 3.3167883211678832, |
| "grad_norm": 0.7779558300971985, |
| "learning_rate": 0.00013366550120803803, |
| "loss": 3.8011, |
| "step": 227200 |
| }, |
| { |
| "epoch": 3.3284671532846715, |
| "grad_norm": 0.9646685719490051, |
| "learning_rate": 0.0001334319228607508, |
| "loss": 3.7926, |
| "step": 228000 |
| }, |
| { |
| "epoch": 3.3401459854014597, |
| "grad_norm": 0.9660009741783142, |
| "learning_rate": 0.0001331983445134636, |
| "loss": 3.802, |
| "step": 228800 |
| }, |
| { |
| "epoch": 3.3518248175182483, |
| "grad_norm": 1.1353583335876465, |
| "learning_rate": 0.00013296476616617638, |
| "loss": 3.8077, |
| "step": 229600 |
| }, |
| { |
| "epoch": 3.3635036496350366, |
| "grad_norm": 0.976076602935791, |
| "learning_rate": 0.0001327311878188892, |
| "loss": 3.7869, |
| "step": 230400 |
| }, |
| { |
| "epoch": 3.375182481751825, |
| "grad_norm": 1.091763973236084, |
| "learning_rate": 0.000132497609471602, |
| "loss": 3.7933, |
| "step": 231200 |
| }, |
| { |
| "epoch": 3.386861313868613, |
| "grad_norm": 1.6800352334976196, |
| "learning_rate": 0.0001322640311243148, |
| "loss": 3.7964, |
| "step": 232000 |
| }, |
| { |
| "epoch": 3.398540145985401, |
| "grad_norm": 1.7834371328353882, |
| "learning_rate": 0.00013203045277702756, |
| "loss": 3.8052, |
| "step": 232800 |
| }, |
| { |
| "epoch": 3.41021897810219, |
| "grad_norm": 1.1066137552261353, |
| "learning_rate": 0.0001317968744297404, |
| "loss": 3.8186, |
| "step": 233600 |
| }, |
| { |
| "epoch": 3.421897810218978, |
| "grad_norm": 1.4821542501449585, |
| "learning_rate": 0.00013156329608245317, |
| "loss": 3.8012, |
| "step": 234400 |
| }, |
| { |
| "epoch": 3.4335766423357663, |
| "grad_norm": 1.6962345838546753, |
| "learning_rate": 0.00013132971773516597, |
| "loss": 3.7981, |
| "step": 235200 |
| }, |
| { |
| "epoch": 3.445255474452555, |
| "grad_norm": 1.6591582298278809, |
| "learning_rate": 0.00013109613938787874, |
| "loss": 3.8167, |
| "step": 236000 |
| }, |
| { |
| "epoch": 3.456934306569343, |
| "grad_norm": 0.7325506806373596, |
| "learning_rate": 0.00013086256104059154, |
| "loss": 3.8046, |
| "step": 236800 |
| }, |
| { |
| "epoch": 3.4686131386861314, |
| "grad_norm": 1.017753005027771, |
| "learning_rate": 0.00013062898269330434, |
| "loss": 3.8098, |
| "step": 237600 |
| }, |
| { |
| "epoch": 3.4802919708029196, |
| "grad_norm": 1.0426437854766846, |
| "learning_rate": 0.00013039540434601715, |
| "loss": 3.8045, |
| "step": 238400 |
| }, |
| { |
| "epoch": 3.491970802919708, |
| "grad_norm": 0.8641120791435242, |
| "learning_rate": 0.00013016182599872992, |
| "loss": 3.8202, |
| "step": 239200 |
| }, |
| { |
| "epoch": 3.5036496350364965, |
| "grad_norm": 0.7680474519729614, |
| "learning_rate": 0.00012992824765144272, |
| "loss": 3.8116, |
| "step": 240000 |
| }, |
| { |
| "epoch": 3.5153284671532847, |
| "grad_norm": 0.8205093741416931, |
| "learning_rate": 0.0001296946693041555, |
| "loss": 3.8002, |
| "step": 240800 |
| }, |
| { |
| "epoch": 3.527007299270073, |
| "grad_norm": 1.1150528192520142, |
| "learning_rate": 0.00012946109095686833, |
| "loss": 3.8154, |
| "step": 241600 |
| }, |
| { |
| "epoch": 3.538686131386861, |
| "grad_norm": 0.9264869689941406, |
| "learning_rate": 0.0001292275126095811, |
| "loss": 3.7957, |
| "step": 242400 |
| }, |
| { |
| "epoch": 3.5503649635036494, |
| "grad_norm": 0.9504124522209167, |
| "learning_rate": 0.0001289939342622939, |
| "loss": 3.8059, |
| "step": 243200 |
| }, |
| { |
| "epoch": 3.562043795620438, |
| "grad_norm": 0.6638396382331848, |
| "learning_rate": 0.00012876035591500668, |
| "loss": 3.8171, |
| "step": 244000 |
| }, |
| { |
| "epoch": 3.5737226277372263, |
| "grad_norm": 0.5771734118461609, |
| "learning_rate": 0.00012852677756771948, |
| "loss": 3.8241, |
| "step": 244800 |
| }, |
| { |
| "epoch": 3.5854014598540145, |
| "grad_norm": 0.9084689617156982, |
| "learning_rate": 0.00012829319922043228, |
| "loss": 3.8075, |
| "step": 245600 |
| }, |
| { |
| "epoch": 3.597080291970803, |
| "grad_norm": 1.1063374280929565, |
| "learning_rate": 0.00012805962087314508, |
| "loss": 3.8197, |
| "step": 246400 |
| }, |
| { |
| "epoch": 3.6087591240875914, |
| "grad_norm": 0.9490681886672974, |
| "learning_rate": 0.00012782604252585785, |
| "loss": 3.8158, |
| "step": 247200 |
| }, |
| { |
| "epoch": 3.6204379562043796, |
| "grad_norm": 0.771484375, |
| "learning_rate": 0.00012759246417857066, |
| "loss": 3.8136, |
| "step": 248000 |
| }, |
| { |
| "epoch": 3.632116788321168, |
| "grad_norm": 1.1464002132415771, |
| "learning_rate": 0.00012735888583128343, |
| "loss": 3.8167, |
| "step": 248800 |
| }, |
| { |
| "epoch": 3.643795620437956, |
| "grad_norm": 1.292195200920105, |
| "learning_rate": 0.00012712530748399626, |
| "loss": 3.8133, |
| "step": 249600 |
| }, |
| { |
| "epoch": 3.6554744525547447, |
| "grad_norm": 1.0379976034164429, |
| "learning_rate": 0.00012689172913670903, |
| "loss": 3.8102, |
| "step": 250400 |
| }, |
| { |
| "epoch": 3.667153284671533, |
| "grad_norm": 1.7028378248214722, |
| "learning_rate": 0.00012665815078942183, |
| "loss": 3.8138, |
| "step": 251200 |
| }, |
| { |
| "epoch": 3.678832116788321, |
| "grad_norm": 1.4890276193618774, |
| "learning_rate": 0.0001264245724421346, |
| "loss": 3.8195, |
| "step": 252000 |
| }, |
| { |
| "epoch": 3.6905109489051093, |
| "grad_norm": 1.1416970491409302, |
| "learning_rate": 0.0001261909940948474, |
| "loss": 3.8083, |
| "step": 252800 |
| }, |
| { |
| "epoch": 3.7021897810218976, |
| "grad_norm": 1.3536219596862793, |
| "learning_rate": 0.0001259574157475602, |
| "loss": 3.8155, |
| "step": 253600 |
| }, |
| { |
| "epoch": 3.713868613138686, |
| "grad_norm": 0.939917266368866, |
| "learning_rate": 0.00012572383740027301, |
| "loss": 3.8224, |
| "step": 254400 |
| }, |
| { |
| "epoch": 3.7255474452554744, |
| "grad_norm": 0.570955753326416, |
| "learning_rate": 0.0001254902590529858, |
| "loss": 3.8202, |
| "step": 255200 |
| }, |
| { |
| "epoch": 3.7372262773722627, |
| "grad_norm": 1.467022180557251, |
| "learning_rate": 0.0001252566807056986, |
| "loss": 3.8217, |
| "step": 256000 |
| }, |
| { |
| "epoch": 3.7489051094890513, |
| "grad_norm": 0.7063941955566406, |
| "learning_rate": 0.00012502310235841136, |
| "loss": 3.8166, |
| "step": 256800 |
| }, |
| { |
| "epoch": 3.7605839416058395, |
| "grad_norm": 1.1569101810455322, |
| "learning_rate": 0.0001247895240111242, |
| "loss": 3.8151, |
| "step": 257600 |
| }, |
| { |
| "epoch": 3.7722627737226277, |
| "grad_norm": 1.2285373210906982, |
| "learning_rate": 0.00012455594566383697, |
| "loss": 3.8214, |
| "step": 258400 |
| }, |
| { |
| "epoch": 3.783941605839416, |
| "grad_norm": 0.9570793509483337, |
| "learning_rate": 0.00012432236731654977, |
| "loss": 3.8126, |
| "step": 259200 |
| }, |
| { |
| "epoch": 3.795620437956204, |
| "grad_norm": 0.7642357349395752, |
| "learning_rate": 0.00012408878896926254, |
| "loss": 3.8039, |
| "step": 260000 |
| }, |
| { |
| "epoch": 3.807299270072993, |
| "grad_norm": 1.2175133228302002, |
| "learning_rate": 0.00012385521062197537, |
| "loss": 3.8103, |
| "step": 260800 |
| }, |
| { |
| "epoch": 3.818978102189781, |
| "grad_norm": 0.6660974025726318, |
| "learning_rate": 0.00012362163227468815, |
| "loss": 3.8113, |
| "step": 261600 |
| }, |
| { |
| "epoch": 3.8306569343065693, |
| "grad_norm": 1.5753804445266724, |
| "learning_rate": 0.00012338805392740095, |
| "loss": 3.8047, |
| "step": 262400 |
| }, |
| { |
| "epoch": 3.8423357664233575, |
| "grad_norm": 0.9252421259880066, |
| "learning_rate": 0.00012315447558011372, |
| "loss": 3.8209, |
| "step": 263200 |
| }, |
| { |
| "epoch": 3.8540145985401457, |
| "grad_norm": 1.3272552490234375, |
| "learning_rate": 0.00012292089723282652, |
| "loss": 3.8153, |
| "step": 264000 |
| } |
| ], |
| "logging_steps": 800, |
| "max_steps": 685000, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 10, |
| "save_steps": 4000, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 4.2911936164773396e+18, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|