[ { "step": 1405, "epoch": 0.3977634652133909, "wallclock": "2026-05-23T01:59:19.731727", "loss": 0.1505, "grad_norm": 0.860001266002655, "learning_rate": 9.27049077439764e-06 }, { "step": 1410, "epoch": 0.39917899355934605, "wallclock": "2026-05-23T02:01:12.843146", "loss": 0.137, "grad_norm": 0.8940677642822266, "learning_rate": 9.264517869578343e-06, "step_time_sec": 113.11 }, { "step": 1415, "epoch": 0.40059452190530115, "wallclock": "2026-05-23T02:03:05.328968", "loss": 0.1519, "grad_norm": 0.8663320541381836, "learning_rate": 9.258522553059383e-06, "step_time_sec": 112.49 }, { "step": 1420, "epoch": 0.4020100502512563, "wallclock": "2026-05-23T02:04:58.249080", "loss": 0.1329, "grad_norm": 0.8876581192016602, "learning_rate": 9.252504856348483e-06, "step_time_sec": 112.92 }, { "step": 1425, "epoch": 0.4034255785972114, "wallclock": "2026-05-23T02:06:50.925025", "loss": 0.1339, "grad_norm": 0.7425838708877563, "learning_rate": 9.246464811070978e-06, "step_time_sec": 112.68 }, { "step": 1430, "epoch": 0.40484110694316655, "wallclock": "2026-05-23T02:08:44.125444", "loss": 0.1263, "grad_norm": 0.8344400525093079, "learning_rate": 9.240402448969655e-06, "step_time_sec": 113.2 }, { "step": 1435, "epoch": 0.40625663528912165, "wallclock": "2026-05-23T02:10:37.926903", "loss": 0.1374, "grad_norm": 0.920082688331604, "learning_rate": 9.234317801904584e-06, "step_time_sec": 113.8 }, { "step": 1440, "epoch": 0.4076721636350768, "wallclock": "2026-05-23T02:12:30.509342", "loss": 0.1522, "grad_norm": 0.9682347178459167, "learning_rate": 9.228210901852953e-06, "step_time_sec": 112.58 }, { "step": 1445, "epoch": 0.4090876919810319, "wallclock": "2026-05-23T02:14:22.744101", "loss": 0.1435, "grad_norm": 0.8033989667892456, "learning_rate": 9.222081780908894e-06, "step_time_sec": 112.23 }, { "step": 1450, "epoch": 0.41050322032698705, "wallclock": "2026-05-23T02:16:16.036698", "loss": 0.132, "grad_norm": 1.0462369918823242, "learning_rate": 9.215930471283323e-06, "step_time_sec": 113.29, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 64.34 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 1455, "epoch": 0.4119187486729422, "wallclock": "2026-05-23T02:18:08.338694", "loss": 0.1657, "grad_norm": 2.268519401550293, "learning_rate": 9.209757005303761e-06, "step_time_sec": 112.3 }, { "step": 1460, "epoch": 0.4133342770188973, "wallclock": "2026-05-23T02:20:01.532146", "loss": 0.1165, "grad_norm": 0.7390187978744507, "learning_rate": 9.203561415414174e-06, "step_time_sec": 113.19 }, { "step": 1465, "epoch": 0.41474980536485245, "wallclock": "2026-05-23T02:21:55.232651", "loss": 0.12, "grad_norm": 0.884283721446991, "learning_rate": 9.197343734174798e-06, "step_time_sec": 113.7 }, { "step": 1470, "epoch": 0.41616533371080755, "wallclock": "2026-05-23T02:23:47.669724", "loss": 0.1227, "grad_norm": 0.7426964640617371, "learning_rate": 9.191103994261963e-06, "step_time_sec": 112.44 }, { "step": 1475, "epoch": 0.4175808620567627, "wallclock": "2026-05-23T02:25:40.551477", "loss": 0.1423, "grad_norm": 1.1171990633010864, "learning_rate": 9.184842228467929e-06, "step_time_sec": 112.88 }, { "step": 1480, "epoch": 0.4189963904027178, "wallclock": "2026-05-23T02:27:34.235355", "loss": 0.1356, "grad_norm": 1.0424611568450928, "learning_rate": 9.178558469700712e-06, "step_time_sec": 113.68 }, { "step": 1485, "epoch": 0.42041191874867295, "wallclock": "2026-05-23T02:29:26.581237", "loss": 0.1192, "grad_norm": 0.7916944026947021, "learning_rate": 9.172252750983904e-06, "step_time_sec": 112.35 }, { "step": 1490, "epoch": 0.42182744709462805, "wallclock": "2026-05-23T02:31:19.100873", "loss": 0.1178, "grad_norm": 0.6911448240280151, "learning_rate": 9.165925105456513e-06, "step_time_sec": 112.52 }, { "step": 1495, "epoch": 0.4232429754405832, "wallclock": "2026-05-23T02:33:12.432128", "loss": 0.1268, "grad_norm": 1.207095980644226, "learning_rate": 9.159575566372774e-06, "step_time_sec": 113.33 }, { "step": 1500, "epoch": 0.4246585037865383, "wallclock": "2026-05-23T02:35:05.236376", "loss": 0.1249, "grad_norm": 0.8602229952812195, "learning_rate": 9.153204167101984e-06, "step_time_sec": 112.8, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 65.95 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 1500, "epoch": 0.4246585037865383, "wallclock": "2026-05-23T02:35:56.386847", "eval_loss": 0.14635811746120453, "eval_runtime": 51.064, "eval_samples_per_second": 4.896, "eval_steps_per_second": 1.234, "gpu": [ { "gpu": 0, "mem_allocated_gb": 39.05, "mem_reserved_gb": 65.95 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 1505, "epoch": 0.42607403213249345, "wallclock": "2026-05-23T02:39:31.314975", "loss": 0.132, "grad_norm": 0.9033521413803101, "learning_rate": 9.146810941128326e-06, "step_time_sec": 266.08 }, { "step": 1510, "epoch": 0.4274895604784486, "wallclock": "2026-05-23T02:41:24.639692", "loss": 0.1235, "grad_norm": 0.9021329879760742, "learning_rate": 9.140395922050687e-06, "step_time_sec": 113.32 }, { "step": 1515, "epoch": 0.4289050888244037, "wallclock": "2026-05-23T02:43:21.636680", "loss": 0.1443, "grad_norm": 0.8108121752738953, "learning_rate": 9.133959143582485e-06, "step_time_sec": 117.0 }, { "step": 1520, "epoch": 0.43032061717035885, "wallclock": "2026-05-23T02:45:14.801586", "loss": 0.1256, "grad_norm": 0.9193041920661926, "learning_rate": 9.127500639551497e-06, "step_time_sec": 113.16 }, { "step": 1525, "epoch": 0.43173614551631395, "wallclock": "2026-05-23T02:47:07.650420", "loss": 0.1356, "grad_norm": 0.8465185761451721, "learning_rate": 9.12102044389967e-06, "step_time_sec": 112.85 }, { "step": 1530, "epoch": 0.4331516738622691, "wallclock": "2026-05-23T02:49:00.408689", "loss": 0.1384, "grad_norm": 0.973936140537262, "learning_rate": 9.114518590682955e-06, "step_time_sec": 112.76 }, { "step": 1535, "epoch": 0.4345672022082242, "wallclock": "2026-05-23T02:50:52.832459", "loss": 0.1274, "grad_norm": 1.2166610956192017, "learning_rate": 9.107995114071116e-06, "step_time_sec": 112.42 }, { "step": 1540, "epoch": 0.43598273055417935, "wallclock": "2026-05-23T02:52:44.842922", "loss": 0.122, "grad_norm": 0.985847532749176, "learning_rate": 9.101450048347562e-06, "step_time_sec": 112.01 }, { "step": 1545, "epoch": 0.43739825890013445, "wallclock": "2026-05-23T02:54:38.307969", "loss": 0.1365, "grad_norm": 0.7600606083869934, "learning_rate": 9.094883427909156e-06, "step_time_sec": 113.47 }, { "step": 1550, "epoch": 0.4388137872460896, "wallclock": "2026-05-23T02:56:31.349254", "loss": 0.1379, "grad_norm": 0.7994720339775085, "learning_rate": 9.088295287266042e-06, "step_time_sec": 113.04, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 69.86 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 1555, "epoch": 0.44022931559204476, "wallclock": "2026-05-23T02:58:24.231104", "loss": 0.1325, "grad_norm": 0.8235543370246887, "learning_rate": 9.081685661041463e-06, "step_time_sec": 112.88 }, { "step": 1560, "epoch": 0.44164484393799985, "wallclock": "2026-05-23T03:00:19.009451", "loss": 0.1112, "grad_norm": 1.33493173122406, "learning_rate": 9.075054583971575e-06, "step_time_sec": 114.78 }, { "step": 1565, "epoch": 0.443060372283955, "wallclock": "2026-05-23T03:02:11.720414", "loss": 0.1274, "grad_norm": 0.6676927804946899, "learning_rate": 9.068402090905263e-06, "step_time_sec": 112.71 }, { "step": 1570, "epoch": 0.4444759006299101, "wallclock": "2026-05-23T03:04:04.443929", "loss": 0.1158, "grad_norm": 2.0362584590911865, "learning_rate": 9.06172821680397e-06, "step_time_sec": 112.72 }, { "step": 1575, "epoch": 0.44589142897586526, "wallclock": "2026-05-23T03:05:58.111861", "loss": 0.1459, "grad_norm": 0.8041182160377502, "learning_rate": 9.055032996741492e-06, "step_time_sec": 113.67 }, { "step": 1580, "epoch": 0.44730695732182035, "wallclock": "2026-05-23T03:07:51.100629", "loss": 0.1209, "grad_norm": 0.6887193918228149, "learning_rate": 9.048316465903823e-06, "step_time_sec": 112.99 }, { "step": 1585, "epoch": 0.4487224856677755, "wallclock": "2026-05-23T03:09:44.719059", "loss": 0.1472, "grad_norm": 0.9417322278022766, "learning_rate": 9.041578659588938e-06, "step_time_sec": 113.62 }, { "step": 1590, "epoch": 0.4501380140137306, "wallclock": "2026-05-23T03:11:39.177916", "loss": 0.1198, "grad_norm": 0.7076205611228943, "learning_rate": 9.034819613206631e-06, "step_time_sec": 114.46 }, { "step": 1595, "epoch": 0.45155354235968576, "wallclock": "2026-05-23T03:13:32.601273", "loss": 0.1576, "grad_norm": 0.8126243948936462, "learning_rate": 9.028039362278318e-06, "step_time_sec": 113.42 }, { "step": 1600, "epoch": 0.45296907070564085, "wallclock": "2026-05-23T03:15:25.341230", "loss": 0.1392, "grad_norm": 0.8675165176391602, "learning_rate": 9.021237942436855e-06, "step_time_sec": 112.74, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 69.86 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 1600, "epoch": 0.45296907070564085, "wallclock": "2026-05-23T03:16:17.416846", "eval_loss": 0.14519159495830536, "eval_runtime": 51.9828, "eval_samples_per_second": 4.809, "eval_steps_per_second": 1.212, "gpu": [ { "gpu": 0, "mem_allocated_gb": 39.05, "mem_reserved_gb": 69.86 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 1605, "epoch": 0.454384599051596, "wallclock": "2026-05-23T03:19:54.242069", "loss": 0.1252, "grad_norm": 0.7624632716178894, "learning_rate": 9.01441538942635e-06, "step_time_sec": 268.9 }, { "step": 1610, "epoch": 0.45580012739755116, "wallclock": "2026-05-23T03:21:47.588042", "loss": 0.1305, "grad_norm": 0.5635123252868652, "learning_rate": 9.007571739101968e-06, "step_time_sec": 113.35 }, { "step": 1615, "epoch": 0.45721565574350626, "wallclock": "2026-05-23T03:23:41.226600", "loss": 0.1117, "grad_norm": 0.7951876521110535, "learning_rate": 9.000707027429757e-06, "step_time_sec": 113.64 }, { "step": 1620, "epoch": 0.4586311840894614, "wallclock": "2026-05-23T03:25:36.047456", "loss": 0.1283, "grad_norm": 1.121505618095398, "learning_rate": 8.993821290486442e-06, "step_time_sec": 114.82 }, { "step": 1625, "epoch": 0.4600467124354165, "wallclock": "2026-05-23T03:27:30.028714", "loss": 0.1127, "grad_norm": 0.9441781640052795, "learning_rate": 8.98691456445925e-06, "step_time_sec": 113.98 }, { "step": 1630, "epoch": 0.46146224078137166, "wallclock": "2026-05-23T03:29:23.551875", "loss": 0.1246, "grad_norm": 0.8297203779220581, "learning_rate": 8.979986885645712e-06, "step_time_sec": 113.52 }, { "step": 1635, "epoch": 0.46287776912732675, "wallclock": "2026-05-23T03:31:17.421607", "loss": 0.1365, "grad_norm": 1.1671549081802368, "learning_rate": 8.973038290453475e-06, "step_time_sec": 113.87 }, { "step": 1640, "epoch": 0.4642932974732819, "wallclock": "2026-05-23T03:33:12.304973", "loss": 0.1158, "grad_norm": 0.8376030325889587, "learning_rate": 8.966068815400108e-06, "step_time_sec": 114.88 }, { "step": 1645, "epoch": 0.465708825819237, "wallclock": "2026-05-23T03:35:06.915657", "loss": 0.1276, "grad_norm": 0.9669609069824219, "learning_rate": 8.95907849711291e-06, "step_time_sec": 114.61 }, { "step": 1650, "epoch": 0.46712435416519216, "wallclock": "2026-05-23T03:36:59.993882", "loss": 0.1638, "grad_norm": 1.0771512985229492, "learning_rate": 8.952067372328726e-06, "step_time_sec": 113.08, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 69.86 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 1655, "epoch": 0.4685398825111473, "wallclock": "2026-05-23T03:38:54.654794", "loss": 0.1403, "grad_norm": 0.7746709585189819, "learning_rate": 8.94503547789374e-06, "step_time_sec": 114.66 }, { "step": 1660, "epoch": 0.4699554108571024, "wallclock": "2026-05-23T03:40:47.102060", "loss": 0.1352, "grad_norm": 1.1372244358062744, "learning_rate": 8.937982850763293e-06, "step_time_sec": 112.45 }, { "step": 1665, "epoch": 0.47137093920305756, "wallclock": "2026-05-23T03:42:40.432111", "loss": 0.1537, "grad_norm": 0.8946406245231628, "learning_rate": 8.930909528001682e-06, "step_time_sec": 113.33 }, { "step": 1670, "epoch": 0.47278646754901266, "wallclock": "2026-05-23T03:44:34.489209", "loss": 0.1252, "grad_norm": 0.6626783013343811, "learning_rate": 8.923815546781968e-06, "step_time_sec": 114.06 }, { "step": 1675, "epoch": 0.4742019958949678, "wallclock": "2026-05-23T03:46:29.929407", "loss": 0.1148, "grad_norm": 0.7032930850982666, "learning_rate": 8.916700944385783e-06, "step_time_sec": 115.44 }, { "step": 1680, "epoch": 0.4756175242409229, "wallclock": "2026-05-23T03:48:23.844510", "loss": 0.139, "grad_norm": 0.9184028506278992, "learning_rate": 8.90956575820313e-06, "step_time_sec": 113.92 }, { "step": 1685, "epoch": 0.47703305258687806, "wallclock": "2026-05-23T03:50:18.747236", "loss": 0.1439, "grad_norm": 0.9489091038703918, "learning_rate": 8.902410025732182e-06, "step_time_sec": 114.9 }, { "step": 1690, "epoch": 0.47844858093283316, "wallclock": "2026-05-23T03:52:12.030744", "loss": 0.1063, "grad_norm": 0.8725413680076599, "learning_rate": 8.895233784579098e-06, "step_time_sec": 113.28 }, { "step": 1695, "epoch": 0.4798641092787883, "wallclock": "2026-05-23T03:54:05.237973", "loss": 0.1254, "grad_norm": 0.8798477649688721, "learning_rate": 8.888037072457817e-06, "step_time_sec": 113.21 }, { "step": 1700, "epoch": 0.48127963762474346, "wallclock": "2026-05-23T03:55:59.391912", "loss": 0.1357, "grad_norm": 0.8217583298683167, "learning_rate": 8.88081992718986e-06, "step_time_sec": 114.15, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 69.86 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 1700, "epoch": 0.48127963762474346, "wallclock": "2026-05-23T03:56:51.332412", "eval_loss": 0.14282415807247162, "eval_runtime": 51.857, "eval_samples_per_second": 4.821, "eval_steps_per_second": 1.215, "gpu": [ { "gpu": 0, "mem_allocated_gb": 39.05, "mem_reserved_gb": 69.86 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 1705, "epoch": 0.48269516597069856, "wallclock": "2026-05-23T04:00:27.956136", "loss": 0.1428, "grad_norm": 0.7931806445121765, "learning_rate": 8.873582386704132e-06, "step_time_sec": 268.56 }, { "step": 1710, "epoch": 0.4841106943166537, "wallclock": "2026-05-23T04:02:22.110676", "loss": 0.1402, "grad_norm": 1.0113517045974731, "learning_rate": 8.86632448903672e-06, "step_time_sec": 114.15 }, { "step": 1715, "epoch": 0.4855262226626088, "wallclock": "2026-05-23T04:04:17.103828", "loss": 0.1213, "grad_norm": 0.9483981132507324, "learning_rate": 8.859046272330698e-06, "step_time_sec": 114.99 }, { "step": 1720, "epoch": 0.48694175100856396, "wallclock": "2026-05-23T04:06:09.837485", "loss": 0.1287, "grad_norm": 0.8060489296913147, "learning_rate": 8.851747774835927e-06, "step_time_sec": 112.73 }, { "step": 1725, "epoch": 0.48835727935451906, "wallclock": "2026-05-23T04:08:03.048184", "loss": 0.1348, "grad_norm": 1.2514666318893433, "learning_rate": 8.84442903490885e-06, "step_time_sec": 113.21 }, { "step": 1730, "epoch": 0.4897728077004742, "wallclock": "2026-05-23T04:09:56.786981", "loss": 0.1261, "grad_norm": 0.8523698449134827, "learning_rate": 8.837090091012289e-06, "step_time_sec": 113.74 }, { "step": 1735, "epoch": 0.4911883360464293, "wallclock": "2026-05-23T04:11:50.314356", "loss": 0.1365, "grad_norm": 1.0180977582931519, "learning_rate": 8.82973098171525e-06, "step_time_sec": 113.53 }, { "step": 1740, "epoch": 0.49260386439238446, "wallclock": "2026-05-23T04:13:43.729160", "loss": 0.1338, "grad_norm": 0.5706004500389099, "learning_rate": 8.822351745692714e-06, "step_time_sec": 113.41 }, { "step": 1745, "epoch": 0.49401939273833956, "wallclock": "2026-05-23T04:15:36.906607", "loss": 0.1225, "grad_norm": 0.8971516489982605, "learning_rate": 8.814952421725434e-06, "step_time_sec": 113.18 }, { "step": 1750, "epoch": 0.4954349210842947, "wallclock": "2026-05-23T04:17:31.144814", "loss": 0.1199, "grad_norm": 0.8799176812171936, "learning_rate": 8.807533048699734e-06, "step_time_sec": 114.24, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 69.86 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 1755, "epoch": 0.49685044943024986, "wallclock": "2026-05-23T04:19:24.124359", "loss": 0.1161, "grad_norm": 0.7670193910598755, "learning_rate": 8.800093665607307e-06, "step_time_sec": 112.98 }, { "step": 1760, "epoch": 0.49826597777620496, "wallclock": "2026-05-23T04:21:16.253579", "loss": 0.1362, "grad_norm": 1.0961898565292358, "learning_rate": 8.792634311545002e-06, "step_time_sec": 112.13 }, { "step": 1765, "epoch": 0.4996815061221601, "wallclock": "2026-05-23T04:23:08.900369", "loss": 0.1246, "grad_norm": 0.9300926923751831, "learning_rate": 8.785155025714626e-06, "step_time_sec": 112.65 }, { "step": 1770, "epoch": 0.5010970344681153, "wallclock": "2026-05-23T04:25:01.641415", "loss": 0.13, "grad_norm": 0.9323188066482544, "learning_rate": 8.777655847422734e-06, "step_time_sec": 112.74 }, { "step": 1775, "epoch": 0.5025125628140703, "wallclock": "2026-05-23T04:26:53.919382", "loss": 0.1228, "grad_norm": 0.8098039627075195, "learning_rate": 8.770136816080426e-06, "step_time_sec": 112.28 }, { "step": 1780, "epoch": 0.5039280911600255, "wallclock": "2026-05-23T04:28:47.742000", "loss": 0.1395, "grad_norm": 0.857759952545166, "learning_rate": 8.76259797120313e-06, "step_time_sec": 113.82 }, { "step": 1785, "epoch": 0.5053436195059806, "wallclock": "2026-05-23T04:30:40.247364", "loss": 0.1259, "grad_norm": 0.847581148147583, "learning_rate": 8.755039352410414e-06, "step_time_sec": 112.51 }, { "step": 1790, "epoch": 0.5067591478519358, "wallclock": "2026-05-23T04:32:33.726589", "loss": 0.1352, "grad_norm": 0.7166717052459717, "learning_rate": 8.747460999425755e-06, "step_time_sec": 113.48 }, { "step": 1795, "epoch": 0.5081746761978909, "wallclock": "2026-05-23T04:34:27.718052", "loss": 0.1319, "grad_norm": 1.0256786346435547, "learning_rate": 8.739862952076346e-06, "step_time_sec": 113.99 }, { "step": 1800, "epoch": 0.509590204543846, "wallclock": "2026-05-23T04:36:20.348096", "loss": 0.1174, "grad_norm": 0.7882758975028992, "learning_rate": 8.732245250292878e-06, "step_time_sec": 112.63, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 69.86 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 1800, "epoch": 0.509590204543846, "wallclock": "2026-05-23T04:37:12.805799", "eval_loss": 0.14175137877464294, "eval_runtime": 52.3687, "eval_samples_per_second": 4.774, "eval_steps_per_second": 1.203, "gpu": [ { "gpu": 0, "mem_allocated_gb": 39.05, "mem_reserved_gb": 69.86 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 1805, "epoch": 0.5110057328898011, "wallclock": "2026-05-23T04:40:48.723033", "loss": 0.1259, "grad_norm": 0.9180939793586731, "learning_rate": 8.72460793410934e-06, "step_time_sec": 268.37 }, { "step": 1810, "epoch": 0.5124212612357563, "wallclock": "2026-05-23T04:42:42.010667", "loss": 0.1238, "grad_norm": 0.8965495228767395, "learning_rate": 8.716951043662796e-06, "step_time_sec": 113.29 }, { "step": 1815, "epoch": 0.5138367895817114, "wallclock": "2026-05-23T04:44:35.309189", "loss": 0.1064, "grad_norm": 0.9334513545036316, "learning_rate": 8.709274619193182e-06, "step_time_sec": 113.3 }, { "step": 1820, "epoch": 0.5152523179276665, "wallclock": "2026-05-23T04:46:29.001032", "loss": 0.1171, "grad_norm": 0.7548913955688477, "learning_rate": 8.701578701043097e-06, "step_time_sec": 113.69 }, { "step": 1825, "epoch": 0.5166678462736216, "wallclock": "2026-05-23T04:48:22.554066", "loss": 0.1248, "grad_norm": 1.022698998451233, "learning_rate": 8.693863329657576e-06, "step_time_sec": 113.55 }, { "step": 1830, "epoch": 0.5180833746195768, "wallclock": "2026-05-23T04:50:15.543925", "loss": 0.1423, "grad_norm": 1.0240012407302856, "learning_rate": 8.686128545583906e-06, "step_time_sec": 112.99 }, { "step": 1835, "epoch": 0.5194989029655319, "wallclock": "2026-05-23T04:52:08.302700", "loss": 0.1373, "grad_norm": 1.0934542417526245, "learning_rate": 8.678374389471375e-06, "step_time_sec": 112.76 }, { "step": 1840, "epoch": 0.520914431311487, "wallclock": "2026-05-23T04:54:00.947870", "loss": 0.1463, "grad_norm": 1.0597333908081055, "learning_rate": 8.670600902071096e-06, "step_time_sec": 112.65 }, { "step": 1845, "epoch": 0.5223299596574421, "wallclock": "2026-05-23T04:55:54.818374", "loss": 0.1206, "grad_norm": 0.7178345918655396, "learning_rate": 8.662808124235765e-06, "step_time_sec": 113.87 }, { "step": 1850, "epoch": 0.5237454880033973, "wallclock": "2026-05-23T04:57:48.527340", "loss": 0.1075, "grad_norm": 1.275473952293396, "learning_rate": 8.65499609691946e-06, "step_time_sec": 113.71, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 69.86 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 1855, "epoch": 0.5251610163493524, "wallclock": "2026-05-23T04:59:41.867913", "loss": 0.1023, "grad_norm": 0.5519967675209045, "learning_rate": 8.647164861177422e-06, "step_time_sec": 113.34 }, { "step": 1860, "epoch": 0.5265765446953076, "wallclock": "2026-05-23T05:01:36.546653", "loss": 0.1367, "grad_norm": 0.9184526205062866, "learning_rate": 8.639314458165839e-06, "step_time_sec": 114.68 }, { "step": 1865, "epoch": 0.5279920730412626, "wallclock": "2026-05-23T05:03:30.220915", "loss": 0.1332, "grad_norm": 0.938758373260498, "learning_rate": 8.631444929141635e-06, "step_time_sec": 113.67 }, { "step": 1870, "epoch": 0.5294076013872178, "wallclock": "2026-05-23T05:05:24.720616", "loss": 0.107, "grad_norm": 0.8511345982551575, "learning_rate": 8.62355631546224e-06, "step_time_sec": 114.5 }, { "step": 1875, "epoch": 0.5308231297331729, "wallclock": "2026-05-23T05:07:19.388697", "loss": 0.1276, "grad_norm": 1.1140179634094238, "learning_rate": 8.615648658585392e-06, "step_time_sec": 114.67 }, { "step": 1880, "epoch": 0.5322386580791281, "wallclock": "2026-05-23T05:09:13.975351", "loss": 0.117, "grad_norm": 0.6539268493652344, "learning_rate": 8.607722000068898e-06, "step_time_sec": 114.59 }, { "step": 1885, "epoch": 0.5336541864250831, "wallclock": "2026-05-23T05:11:08.325687", "loss": 0.1193, "grad_norm": 0.8391310572624207, "learning_rate": 8.599776381570433e-06, "step_time_sec": 114.35 }, { "step": 1890, "epoch": 0.5350697147710383, "wallclock": "2026-05-23T05:13:02.941530", "loss": 0.1264, "grad_norm": 0.844965398311615, "learning_rate": 8.59181184484731e-06, "step_time_sec": 114.62 }, { "step": 1895, "epoch": 0.5364852431169934, "wallclock": "2026-05-23T05:14:56.481372", "loss": 0.1396, "grad_norm": 0.7179044485092163, "learning_rate": 8.583828431756272e-06, "step_time_sec": 113.54 }, { "step": 1900, "epoch": 0.5379007714629486, "wallclock": "2026-05-23T05:16:51.210427", "loss": 0.0974, "grad_norm": 0.8166824579238892, "learning_rate": 8.575826184253254e-06, "step_time_sec": 114.73, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 69.86 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 1900, "epoch": 0.5379007714629486, "wallclock": "2026-05-23T05:17:43.683064", "eval_loss": 0.14031976461410522, "eval_runtime": 52.3833, "eval_samples_per_second": 4.773, "eval_steps_per_second": 1.203, "gpu": [ { "gpu": 0, "mem_allocated_gb": 39.05, "mem_reserved_gb": 69.86 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 1905, "epoch": 0.5393162998089037, "wallclock": "2026-05-23T05:21:18.913048", "loss": 0.1219, "grad_norm": 1.006734013557434, "learning_rate": 8.567805144393176e-06, "step_time_sec": 267.7 }, { "step": 1910, "epoch": 0.5407318281548588, "wallclock": "2026-05-23T05:23:13.218448", "loss": 0.1284, "grad_norm": 0.8619974255561829, "learning_rate": 8.559765354329728e-06, "step_time_sec": 114.31 }, { "step": 1915, "epoch": 0.5421473565008139, "wallclock": "2026-05-23T05:25:08.140980", "loss": 0.1184, "grad_norm": 1.2139092683792114, "learning_rate": 8.55170685631513e-06, "step_time_sec": 114.92 }, { "step": 1920, "epoch": 0.5435628848467691, "wallclock": "2026-05-23T05:27:03.707486", "loss": 0.1129, "grad_norm": 0.9047484397888184, "learning_rate": 8.54362969269992e-06, "step_time_sec": 115.57 }, { "step": 1925, "epoch": 0.5449784131927242, "wallclock": "2026-05-23T05:28:57.612333", "loss": 0.1163, "grad_norm": 0.6891061663627625, "learning_rate": 8.535533905932739e-06, "step_time_sec": 113.9 }, { "step": 1930, "epoch": 0.5463939415386793, "wallclock": "2026-05-23T05:30:52.594285", "loss": 0.1164, "grad_norm": 0.6650737524032593, "learning_rate": 8.527419538560088e-06, "step_time_sec": 114.98 }, { "step": 1935, "epoch": 0.5478094698846344, "wallclock": "2026-05-23T05:32:48.432100", "loss": 0.1187, "grad_norm": 1.1412484645843506, "learning_rate": 8.51928663322613e-06, "step_time_sec": 115.84 }, { "step": 1940, "epoch": 0.5492249982305896, "wallclock": "2026-05-23T05:34:43.177149", "loss": 0.1342, "grad_norm": 0.7133747339248657, "learning_rate": 8.511135232672442e-06, "step_time_sec": 114.75 }, { "step": 1945, "epoch": 0.5506405265765447, "wallclock": "2026-05-23T05:36:42.534792", "loss": 0.1132, "grad_norm": 1.0151540040969849, "learning_rate": 8.502965379737802e-06, "step_time_sec": 119.36 }, { "step": 1950, "epoch": 0.5520560549224999, "wallclock": "2026-05-23T05:38:46.820577", "loss": 0.1273, "grad_norm": 1.6805675029754639, "learning_rate": 8.494777117357964e-06, "step_time_sec": 124.29, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 69.86 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 1955, "epoch": 0.5534715832684549, "wallclock": "2026-05-23T05:40:50.018680", "loss": 0.1142, "grad_norm": 0.9018206596374512, "learning_rate": 8.486570488565432e-06, "step_time_sec": 123.2 }, { "step": 1960, "epoch": 0.5548871116144101, "wallclock": "2026-05-23T05:42:53.671070", "loss": 0.1258, "grad_norm": 0.7533476948738098, "learning_rate": 8.478345536489232e-06, "step_time_sec": 123.65 }, { "step": 1965, "epoch": 0.5563026399603652, "wallclock": "2026-05-23T05:44:56.957810", "loss": 0.1218, "grad_norm": 1.134895920753479, "learning_rate": 8.470102304354685e-06, "step_time_sec": 123.29 }, { "step": 1970, "epoch": 0.5577181683063204, "wallclock": "2026-05-23T05:47:01.054040", "loss": 0.1344, "grad_norm": 0.9846596717834473, "learning_rate": 8.461840835483179e-06, "step_time_sec": 124.1 }, { "step": 1975, "epoch": 0.5591336966522754, "wallclock": "2026-05-23T05:49:04.326418", "loss": 0.1272, "grad_norm": 0.8339362144470215, "learning_rate": 8.45356117329195e-06, "step_time_sec": 123.27 }, { "step": 1980, "epoch": 0.5605492249982306, "wallclock": "2026-05-23T05:51:07.881648", "loss": 0.1041, "grad_norm": 1.041932463645935, "learning_rate": 8.445263361293839e-06, "step_time_sec": 123.56 }, { "step": 1985, "epoch": 0.5619647533441857, "wallclock": "2026-05-23T05:53:11.738690", "loss": 0.1492, "grad_norm": 0.9378158450126648, "learning_rate": 8.436947443097074e-06, "step_time_sec": 123.86 }, { "step": 1990, "epoch": 0.5633802816901409, "wallclock": "2026-05-23T05:55:16.469073", "loss": 0.1055, "grad_norm": 1.0052165985107422, "learning_rate": 8.428613462405042e-06, "step_time_sec": 124.73 }, { "step": 1995, "epoch": 0.564795810036096, "wallclock": "2026-05-23T05:57:21.072731", "loss": 0.1157, "grad_norm": 0.9656962752342224, "learning_rate": 8.42026146301605e-06, "step_time_sec": 124.6 }, { "step": 2000, "epoch": 0.5662113383820511, "wallclock": "2026-05-23T05:59:27.133239", "loss": 0.1099, "grad_norm": 0.6400126814842224, "learning_rate": 8.411891488823102e-06, "step_time_sec": 126.06, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 69.86 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 2000, "epoch": 0.5662113383820511, "wallclock": "2026-05-23T06:00:26.330552", "eval_loss": 0.13213595747947693, "eval_runtime": 59.102, "eval_samples_per_second": 4.23, "eval_steps_per_second": 1.066, "gpu": [ { "gpu": 0, "mem_allocated_gb": 39.05, "mem_reserved_gb": 69.86 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 2005, "epoch": 0.5676268667280062, "wallclock": "2026-05-23T06:04:02.423533", "loss": 0.1243, "grad_norm": 1.0383392572402954, "learning_rate": 8.40350358381367e-06, "step_time_sec": 275.29 }, { "step": 2010, "epoch": 0.5690423950739614, "wallclock": "2026-05-23T06:05:55.119665", "loss": 0.1192, "grad_norm": 1.1544498205184937, "learning_rate": 8.39509779206945e-06, "step_time_sec": 112.7 }, { "step": 2015, "epoch": 0.5704579234199165, "wallclock": "2026-05-23T06:07:49.815988", "loss": 0.125, "grad_norm": 1.1813828945159912, "learning_rate": 8.386674157766156e-06, "step_time_sec": 114.7 }, { "step": 2020, "epoch": 0.5718734517658716, "wallclock": "2026-05-23T06:09:44.079892", "loss": 0.0941, "grad_norm": 0.582125723361969, "learning_rate": 8.378232725173253e-06, "step_time_sec": 114.26 }, { "step": 2025, "epoch": 0.5732889801118267, "wallclock": "2026-05-23T06:11:37.953666", "loss": 0.1276, "grad_norm": 0.8630328178405762, "learning_rate": 8.369773538653756e-06, "step_time_sec": 113.87 }, { "step": 2030, "epoch": 0.5747045084577819, "wallclock": "2026-05-23T06:13:33.538279", "loss": 0.1139, "grad_norm": 0.7153676748275757, "learning_rate": 8.361296642663977e-06, "step_time_sec": 115.58 }, { "step": 2035, "epoch": 0.576120036803737, "wallclock": "2026-05-23T06:15:28.201077", "loss": 0.1186, "grad_norm": 1.0687501430511475, "learning_rate": 8.352802081753304e-06, "step_time_sec": 114.66 }, { "step": 2040, "epoch": 0.5775355651496922, "wallclock": "2026-05-23T06:17:21.826972", "loss": 0.0957, "grad_norm": 0.7276541590690613, "learning_rate": 8.344289900563955e-06, "step_time_sec": 113.63 }, { "step": 2045, "epoch": 0.5789510934956472, "wallclock": "2026-05-23T06:19:15.755614", "loss": 0.1418, "grad_norm": 1.2831865549087524, "learning_rate": 8.335760143830753e-06, "step_time_sec": 113.93 }, { "step": 2050, "epoch": 0.5803666218416024, "wallclock": "2026-05-23T06:21:10.146824", "loss": 0.0902, "grad_norm": 0.8044394850730896, "learning_rate": 8.327212856380886e-06, "step_time_sec": 114.39, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 69.86 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 2055, "epoch": 0.5817821501875575, "wallclock": "2026-05-23T06:23:03.760022", "loss": 0.1254, "grad_norm": 0.8785029053688049, "learning_rate": 8.318648083133675e-06, "step_time_sec": 113.61 }, { "step": 2060, "epoch": 0.5831976785335127, "wallclock": "2026-05-23T06:24:58.159811", "loss": 0.1295, "grad_norm": 0.8821666240692139, "learning_rate": 8.310065869100332e-06, "step_time_sec": 114.4 }, { "step": 2065, "epoch": 0.5846132068794677, "wallclock": "2026-05-23T06:26:51.514557", "loss": 0.1296, "grad_norm": 1.0319464206695557, "learning_rate": 8.301466259383729e-06, "step_time_sec": 113.35 }, { "step": 2070, "epoch": 0.5860287352254229, "wallclock": "2026-05-23T06:28:45.591485", "loss": 0.1134, "grad_norm": 0.7893862724304199, "learning_rate": 8.292849299178158e-06, "step_time_sec": 114.08 }, { "step": 2075, "epoch": 0.587444263571378, "wallclock": "2026-05-23T06:30:40.034770", "loss": 0.1123, "grad_norm": 0.8960036635398865, "learning_rate": 8.284215033769098e-06, "step_time_sec": 114.44 }, { "step": 2080, "epoch": 0.5888597919173332, "wallclock": "2026-05-23T06:32:33.013250", "loss": 0.1019, "grad_norm": 0.7732668519020081, "learning_rate": 8.275563508532972e-06, "step_time_sec": 112.98 }, { "step": 2085, "epoch": 0.5902753202632883, "wallclock": "2026-05-23T06:34:26.451713", "loss": 0.1159, "grad_norm": 1.014701008796692, "learning_rate": 8.266894768936907e-06, "step_time_sec": 113.44 }, { "step": 2090, "epoch": 0.5916908486092434, "wallclock": "2026-05-23T06:36:20.092613", "loss": 0.117, "grad_norm": 1.0048466920852661, "learning_rate": 8.258208860538498e-06, "step_time_sec": 113.64 }, { "step": 2095, "epoch": 0.5931063769551985, "wallclock": "2026-05-23T06:38:13.619925", "loss": 0.1295, "grad_norm": 1.0775166749954224, "learning_rate": 8.249505828985575e-06, "step_time_sec": 113.53 }, { "step": 2100, "epoch": 0.5945219053011537, "wallclock": "2026-05-23T06:40:07.681597", "loss": 0.1198, "grad_norm": 1.339026689529419, "learning_rate": 8.240785720015954e-06, "step_time_sec": 114.06, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 69.86 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 2100, "epoch": 0.5945219053011537, "wallclock": "2026-05-23T06:40:59.897976", "eval_loss": 0.1282491832971573, "eval_runtime": 52.1233, "eval_samples_per_second": 4.796, "eval_steps_per_second": 1.209, "gpu": [ { "gpu": 0, "mem_allocated_gb": 39.05, "mem_reserved_gb": 69.86 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 2105, "epoch": 0.5959374336471088, "wallclock": "2026-05-23T06:44:35.507980", "loss": 0.0943, "grad_norm": 0.7660958766937256, "learning_rate": 8.232048579457194e-06, "step_time_sec": 267.83 }, { "step": 2110, "epoch": 0.5973529619930639, "wallclock": "2026-05-23T06:46:29.777766", "loss": 0.11, "grad_norm": 0.9617125391960144, "learning_rate": 8.22329445322637e-06, "step_time_sec": 114.27 }, { "step": 2115, "epoch": 0.598768490339019, "wallclock": "2026-05-23T06:48:22.536086", "loss": 0.1132, "grad_norm": 1.1251046657562256, "learning_rate": 8.214523387329815e-06, "step_time_sec": 112.76 }, { "step": 2120, "epoch": 0.6001840186849742, "wallclock": "2026-05-23T06:50:15.285691", "loss": 0.1012, "grad_norm": 0.8359034657478333, "learning_rate": 8.205735427862897e-06, "step_time_sec": 112.75 }, { "step": 2125, "epoch": 0.6015995470309293, "wallclock": "2026-05-23T06:52:10.239923", "loss": 0.0948, "grad_norm": 0.8290632963180542, "learning_rate": 8.196930621009756e-06, "step_time_sec": 114.95 }, { "step": 2130, "epoch": 0.6030150753768844, "wallclock": "2026-05-23T06:54:03.899054", "loss": 0.1103, "grad_norm": 0.707132875919342, "learning_rate": 8.188109013043076e-06, "step_time_sec": 113.66 }, { "step": 2135, "epoch": 0.6044306037228395, "wallclock": "2026-05-23T06:55:56.905229", "loss": 0.111, "grad_norm": 0.940647542476654, "learning_rate": 8.179270650323839e-06, "step_time_sec": 113.01 }, { "step": 2140, "epoch": 0.6058461320687947, "wallclock": "2026-05-23T06:57:51.331282", "loss": 0.1101, "grad_norm": 0.7413908243179321, "learning_rate": 8.170415579301076e-06, "step_time_sec": 114.43 }, { "step": 2145, "epoch": 0.6072616604147498, "wallclock": "2026-05-23T06:59:44.905917", "loss": 0.1021, "grad_norm": 1.1988078355789185, "learning_rate": 8.161543846511628e-06, "step_time_sec": 113.57 }, { "step": 2150, "epoch": 0.608677188760705, "wallclock": "2026-05-23T07:01:39.153468", "loss": 0.1143, "grad_norm": 1.0968750715255737, "learning_rate": 8.152655498579903e-06, "step_time_sec": 114.25, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 69.86 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 2155, "epoch": 0.61009271710666, "wallclock": "2026-05-23T07:03:33.733115", "loss": 0.1268, "grad_norm": 0.8552664518356323, "learning_rate": 8.143750582217625e-06, "step_time_sec": 114.58 }, { "step": 2160, "epoch": 0.6115082454526152, "wallclock": "2026-05-23T07:05:27.710732", "loss": 0.1103, "grad_norm": 0.7791701555252075, "learning_rate": 8.13482914422359e-06, "step_time_sec": 113.98 }, { "step": 2165, "epoch": 0.6129237737985703, "wallclock": "2026-05-23T07:07:22.028971", "loss": 0.1155, "grad_norm": 0.7360658645629883, "learning_rate": 8.125891231483425e-06, "step_time_sec": 114.32 }, { "step": 2170, "epoch": 0.6143393021445255, "wallclock": "2026-05-23T07:09:16.562706", "loss": 0.1132, "grad_norm": 1.0679337978363037, "learning_rate": 8.11693689096934e-06, "step_time_sec": 114.53 }, { "step": 2175, "epoch": 0.6157548304904805, "wallclock": "2026-05-23T07:11:10.858404", "loss": 0.129, "grad_norm": 0.9493758082389832, "learning_rate": 8.107966169739871e-06, "step_time_sec": 114.3 }, { "step": 2180, "epoch": 0.6171703588364357, "wallclock": "2026-05-23T07:13:03.638564", "loss": 0.1302, "grad_norm": 0.9018224477767944, "learning_rate": 8.09897911493965e-06, "step_time_sec": 112.78 }, { "step": 2185, "epoch": 0.6185858871823908, "wallclock": "2026-05-23T07:14:57.306827", "loss": 0.1218, "grad_norm": 0.8794463276863098, "learning_rate": 8.089975773799143e-06, "step_time_sec": 113.67 }, { "step": 2190, "epoch": 0.620001415528346, "wallclock": "2026-05-23T07:16:51.323807", "loss": 0.11, "grad_norm": 0.8043993711471558, "learning_rate": 8.080956193634409e-06, "step_time_sec": 114.02 }, { "step": 2195, "epoch": 0.6214169438743011, "wallclock": "2026-05-23T07:18:45.611509", "loss": 0.0976, "grad_norm": 1.1800931692123413, "learning_rate": 8.07192042184685e-06, "step_time_sec": 114.29 }, { "step": 2200, "epoch": 0.6228324722202562, "wallclock": "2026-05-23T07:20:38.621541", "loss": 0.1349, "grad_norm": 1.5049303770065308, "learning_rate": 8.062868505922958e-06, "step_time_sec": 113.01, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 69.86 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 2200, "epoch": 0.6228324722202562, "wallclock": "2026-05-23T07:21:30.438443", "eval_loss": 0.12787169218063354, "eval_runtime": 51.72, "eval_samples_per_second": 4.834, "eval_steps_per_second": 1.218, "gpu": [ { "gpu": 0, "mem_allocated_gb": 39.05, "mem_reserved_gb": 69.86 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 2205, "epoch": 0.6242480005662113, "wallclock": "2026-05-23T07:25:03.848081", "loss": 0.1167, "grad_norm": 0.8288029432296753, "learning_rate": 8.053800493434072e-06, "step_time_sec": 265.23 }, { "step": 2210, "epoch": 0.6256635289121665, "wallclock": "2026-05-23T07:26:56.740627", "loss": 0.1079, "grad_norm": 0.8604945540428162, "learning_rate": 8.044716432036126e-06, "step_time_sec": 112.89 }, { "step": 2215, "epoch": 0.6270790572581216, "wallclock": "2026-05-23T07:28:54.914996", "loss": 0.1164, "grad_norm": 0.997947633266449, "learning_rate": 8.035616369469392e-06, "step_time_sec": 118.17 }, { "step": 2220, "epoch": 0.6284945856040767, "wallclock": "2026-05-23T07:30:49.126331", "loss": 0.102, "grad_norm": 0.8771962523460388, "learning_rate": 8.02650035355824e-06, "step_time_sec": 114.21 }, { "step": 2225, "epoch": 0.6299101139500318, "wallclock": "2026-05-23T07:32:43.224804", "loss": 0.1177, "grad_norm": 0.909534752368927, "learning_rate": 8.017368432210875e-06, "step_time_sec": 114.1 }, { "step": 2230, "epoch": 0.631325642295987, "wallclock": "2026-05-23T07:34:37.246776", "loss": 0.1316, "grad_norm": 1.185617446899414, "learning_rate": 8.008220653419097e-06, "step_time_sec": 114.02 }, { "step": 2235, "epoch": 0.6327411706419421, "wallclock": "2026-05-23T07:36:31.707708", "loss": 0.0931, "grad_norm": 0.9247961044311523, "learning_rate": 7.99905706525804e-06, "step_time_sec": 114.46 }, { "step": 2240, "epoch": 0.6341566989878973, "wallclock": "2026-05-23T07:38:24.836647", "loss": 0.0937, "grad_norm": 0.9448702931404114, "learning_rate": 7.989877715885925e-06, "step_time_sec": 113.13 }, { "step": 2245, "epoch": 0.6355722273338523, "wallclock": "2026-05-23T07:40:18.101149", "loss": 0.1124, "grad_norm": 0.9247167110443115, "learning_rate": 7.980682653543799e-06, "step_time_sec": 113.26 }, { "step": 2250, "epoch": 0.6369877556798075, "wallclock": "2026-05-23T07:42:13.210519", "loss": 0.1081, "grad_norm": 1.228428602218628, "learning_rate": 7.97147192655529e-06, "step_time_sec": 115.11, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 69.86 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 2255, "epoch": 0.6384032840257626, "wallclock": "2026-05-23T07:44:07.337158", "loss": 0.1082, "grad_norm": 0.8680895566940308, "learning_rate": 7.962245583326354e-06, "step_time_sec": 114.13 }, { "step": 2260, "epoch": 0.6398188123717178, "wallclock": "2026-05-23T07:46:01.112153", "loss": 0.1073, "grad_norm": 0.7317308783531189, "learning_rate": 7.953003672345009e-06, "step_time_sec": 113.77 }, { "step": 2265, "epoch": 0.6412343407176728, "wallclock": "2026-05-23T07:47:55.256008", "loss": 0.1213, "grad_norm": 0.9891361594200134, "learning_rate": 7.943746242181091e-06, "step_time_sec": 114.14 }, { "step": 2270, "epoch": 0.642649869063628, "wallclock": "2026-05-23T07:49:48.031699", "loss": 0.1084, "grad_norm": 0.8852012753486633, "learning_rate": 7.934473341485998e-06, "step_time_sec": 112.78 }, { "step": 2275, "epoch": 0.6440653974095831, "wallclock": "2026-05-23T07:51:41.760762", "loss": 0.1015, "grad_norm": 0.6731085181236267, "learning_rate": 7.925185018992426e-06, "step_time_sec": 113.73 }, { "step": 2280, "epoch": 0.6454809257555383, "wallclock": "2026-05-23T07:53:37.755943", "loss": 0.0927, "grad_norm": 0.8080906271934509, "learning_rate": 7.91588132351412e-06, "step_time_sec": 116.0 }, { "step": 2285, "epoch": 0.6468964541014934, "wallclock": "2026-05-23T07:55:31.246122", "loss": 0.117, "grad_norm": 0.9637818336486816, "learning_rate": 7.906562303945622e-06, "step_time_sec": 113.49 }, { "step": 2290, "epoch": 0.6483119824474485, "wallclock": "2026-05-23T07:57:25.355025", "loss": 0.1148, "grad_norm": 0.8999826908111572, "learning_rate": 7.897228009262003e-06, "step_time_sec": 114.11 }, { "step": 2295, "epoch": 0.6497275107934036, "wallclock": "2026-05-23T07:59:20.568291", "loss": 0.1202, "grad_norm": 0.655300498008728, "learning_rate": 7.887878488518608e-06, "step_time_sec": 115.21 }, { "step": 2300, "epoch": 0.6511430391393588, "wallclock": "2026-05-23T08:01:15.440455", "loss": 0.1164, "grad_norm": 1.327991247177124, "learning_rate": 7.878513790850805e-06, "step_time_sec": 114.87, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 69.86 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 2300, "epoch": 0.6511430391393588, "wallclock": "2026-05-23T08:02:07.666082", "eval_loss": 0.12934190034866333, "eval_runtime": 52.1298, "eval_samples_per_second": 4.796, "eval_steps_per_second": 1.209, "gpu": [ { "gpu": 0, "mem_allocated_gb": 39.05, "mem_reserved_gb": 69.86 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 2305, "epoch": 0.6525585674853139, "wallclock": "2026-05-23T08:05:41.525707", "loss": 0.0931, "grad_norm": 1.3085259199142456, "learning_rate": 7.869133965473723e-06, "step_time_sec": 266.09 }, { "step": 2310, "epoch": 0.653974095831269, "wallclock": "2026-05-23T08:07:39.677572", "loss": 0.1252, "grad_norm": 0.9861677289009094, "learning_rate": 7.859739061681992e-06, "step_time_sec": 118.15 }, { "step": 2315, "epoch": 0.6553896241772241, "wallclock": "2026-05-23T08:09:33.975162", "loss": 0.1131, "grad_norm": 0.685297966003418, "learning_rate": 7.850329128849482e-06, "step_time_sec": 114.3 }, { "step": 2320, "epoch": 0.6568051525231793, "wallclock": "2026-05-23T08:11:28.536426", "loss": 0.1087, "grad_norm": 0.8919675946235657, "learning_rate": 7.840904216429053e-06, "step_time_sec": 114.56 }, { "step": 2325, "epoch": 0.6582206808691344, "wallclock": "2026-05-23T08:13:23.250581", "loss": 0.1037, "grad_norm": 0.9594758152961731, "learning_rate": 7.83146437395228e-06, "step_time_sec": 114.71 }, { "step": 2330, "epoch": 0.6596362092150896, "wallclock": "2026-05-23T08:15:17.659280", "loss": 0.1021, "grad_norm": 0.79726243019104, "learning_rate": 7.82200965102921e-06, "step_time_sec": 114.41 }, { "step": 2335, "epoch": 0.6610517375610446, "wallclock": "2026-05-23T08:17:10.906487", "loss": 0.1267, "grad_norm": 1.4677671194076538, "learning_rate": 7.812540097348085e-06, "step_time_sec": 113.25 }, { "step": 2340, "epoch": 0.6624672659069998, "wallclock": "2026-05-23T08:19:05.623865", "loss": 0.1022, "grad_norm": 0.8115029335021973, "learning_rate": 7.803055762675096e-06, "step_time_sec": 114.72 }, { "step": 2345, "epoch": 0.6638827942529549, "wallclock": "2026-05-23T08:21:00.057684", "loss": 0.097, "grad_norm": 0.7353535890579224, "learning_rate": 7.793556696854105e-06, "step_time_sec": 114.43 }, { "step": 2350, "epoch": 0.6652983225989101, "wallclock": "2026-05-23T08:22:52.623668", "loss": 0.1056, "grad_norm": 0.9155029058456421, "learning_rate": 7.784042949806401e-06, "step_time_sec": 112.57, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 69.86 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 2355, "epoch": 0.6667138509448651, "wallclock": "2026-05-23T08:24:46.327258", "loss": 0.119, "grad_norm": 1.1471012830734253, "learning_rate": 7.77451457153042e-06, "step_time_sec": 113.7 }, { "step": 2360, "epoch": 0.6681293792908203, "wallclock": "2026-05-23T08:26:40.729225", "loss": 0.1122, "grad_norm": 1.1479600667953491, "learning_rate": 7.764971612101497e-06, "step_time_sec": 114.4 }, { "step": 2365, "epoch": 0.6695449076367754, "wallclock": "2026-05-23T08:28:34.893479", "loss": 0.1187, "grad_norm": 0.990744411945343, "learning_rate": 7.755414121671596e-06, "step_time_sec": 114.16 }, { "step": 2370, "epoch": 0.6709604359827306, "wallclock": "2026-05-23T08:30:29.655994", "loss": 0.1045, "grad_norm": 0.8785448670387268, "learning_rate": 7.745842150469043e-06, "step_time_sec": 114.76 }, { "step": 2375, "epoch": 0.6723759643286856, "wallclock": "2026-05-23T08:32:24.847718", "loss": 0.1015, "grad_norm": 1.0024092197418213, "learning_rate": 7.736255748798272e-06, "step_time_sec": 115.19 }, { "step": 2380, "epoch": 0.6737914926746408, "wallclock": "2026-05-23T08:34:19.407078", "loss": 0.1087, "grad_norm": 1.0146054029464722, "learning_rate": 7.726654967039546e-06, "step_time_sec": 114.56 }, { "step": 2385, "epoch": 0.6752070210205959, "wallclock": "2026-05-23T08:36:13.103873", "loss": 0.1194, "grad_norm": 1.0869743824005127, "learning_rate": 7.717039855648711e-06, "step_time_sec": 113.7 }, { "step": 2390, "epoch": 0.6766225493665511, "wallclock": "2026-05-23T08:38:07.793063", "loss": 0.1053, "grad_norm": 0.6551274061203003, "learning_rate": 7.707410465156916e-06, "step_time_sec": 114.69 }, { "step": 2395, "epoch": 0.6780380777125062, "wallclock": "2026-05-23T08:40:01.316930", "loss": 0.0985, "grad_norm": 0.9398195147514343, "learning_rate": 7.69776684617035e-06, "step_time_sec": 113.52 }, { "step": 2400, "epoch": 0.6794536060584613, "wallclock": "2026-05-23T08:41:54.704114", "loss": 0.1208, "grad_norm": 1.1209269762039185, "learning_rate": 7.688109049369984e-06, "step_time_sec": 113.39, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 69.86 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 2400, "epoch": 0.6794536060584613, "wallclock": "2026-05-23T08:42:47.203641", "eval_loss": 0.11854572594165802, "eval_runtime": 52.4158, "eval_samples_per_second": 4.77, "eval_steps_per_second": 1.202, "gpu": [ { "gpu": 0, "mem_allocated_gb": 39.05, "mem_reserved_gb": 69.86 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 2405, "epoch": 0.6808691344044164, "wallclock": "2026-05-23T08:46:24.285045", "loss": 0.1037, "grad_norm": 0.8943494558334351, "learning_rate": 7.678437125511293e-06, "step_time_sec": 269.58 }, { "step": 2410, "epoch": 0.6822846627503716, "wallclock": "2026-05-23T08:48:17.119278", "loss": 0.1201, "grad_norm": 1.3184447288513184, "learning_rate": 7.668751125423997e-06, "step_time_sec": 112.83 }, { "step": 2415, "epoch": 0.6837001910963267, "wallclock": "2026-05-23T08:50:10.316231", "loss": 0.127, "grad_norm": 1.2354567050933838, "learning_rate": 7.659051100011796e-06, "step_time_sec": 113.2 }, { "step": 2420, "epoch": 0.6851157194422818, "wallclock": "2026-05-23T08:52:04.524428", "loss": 0.0854, "grad_norm": 0.7846460342407227, "learning_rate": 7.649337100252091e-06, "step_time_sec": 114.21 }, { "step": 2425, "epoch": 0.6865312477882369, "wallclock": "2026-05-23T08:53:59.953373", "loss": 0.1035, "grad_norm": 0.6973745226860046, "learning_rate": 7.639609177195732e-06, "step_time_sec": 115.43 }, { "step": 2430, "epoch": 0.6879467761341921, "wallclock": "2026-05-23T08:55:54.650826", "loss": 0.1035, "grad_norm": 0.8783355951309204, "learning_rate": 7.629867381966739e-06, "step_time_sec": 114.7 }, { "step": 2435, "epoch": 0.6893623044801472, "wallclock": "2026-05-23T08:57:49.808654", "loss": 0.1103, "grad_norm": 0.8976749777793884, "learning_rate": 7.6201117657620284e-06, "step_time_sec": 115.16 }, { "step": 2440, "epoch": 0.6907778328261024, "wallclock": "2026-05-23T08:59:43.041184", "loss": 0.1041, "grad_norm": 1.3639253377914429, "learning_rate": 7.610342379851159e-06, "step_time_sec": 113.23 }, { "step": 2445, "epoch": 0.6921933611720574, "wallclock": "2026-05-23T09:01:36.414580", "loss": 0.1172, "grad_norm": 1.34951651096344, "learning_rate": 7.600559275576054e-06, "step_time_sec": 113.37 }, { "step": 2450, "epoch": 0.6936088895180126, "wallclock": "2026-05-23T09:03:31.256289", "loss": 0.1272, "grad_norm": 1.2545363903045654, "learning_rate": 7.590762504350729e-06, "step_time_sec": 114.84, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 69.86 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 2455, "epoch": 0.6950244178639677, "wallclock": "2026-05-23T09:05:26.347778", "loss": 0.1098, "grad_norm": 0.902570903301239, "learning_rate": 7.580952117661028e-06, "step_time_sec": 115.09 }, { "step": 2460, "epoch": 0.6964399462099229, "wallclock": "2026-05-23T09:07:22.291465", "loss": 0.1261, "grad_norm": 1.299424171447754, "learning_rate": 7.571128167064347e-06, "step_time_sec": 115.94 }, { "step": 2465, "epoch": 0.6978554745558779, "wallclock": "2026-05-23T09:09:16.390275", "loss": 0.1101, "grad_norm": 0.9918133020401001, "learning_rate": 7.5612907041893645e-06, "step_time_sec": 114.1 }, { "step": 2470, "epoch": 0.6992710029018331, "wallclock": "2026-05-23T09:11:10.300186", "loss": 0.0887, "grad_norm": 0.9212543964385986, "learning_rate": 7.551439780735775e-06, "step_time_sec": 113.91 }, { "step": 2475, "epoch": 0.7006865312477882, "wallclock": "2026-05-23T09:13:02.291441", "loss": 0.1198, "grad_norm": 1.1632072925567627, "learning_rate": 7.541575448474012e-06, "step_time_sec": 111.99 }, { "step": 2480, "epoch": 0.7021020595937434, "wallclock": "2026-05-23T09:14:55.310823", "loss": 0.0919, "grad_norm": 0.9132311940193176, "learning_rate": 7.531697759244978e-06, "step_time_sec": 113.02 }, { "step": 2485, "epoch": 0.7035175879396985, "wallclock": "2026-05-23T09:16:49.595016", "loss": 0.1046, "grad_norm": 0.9931870698928833, "learning_rate": 7.521806764959769e-06, "step_time_sec": 114.28 }, { "step": 2490, "epoch": 0.7049331162856536, "wallclock": "2026-05-23T09:18:43.462544", "loss": 0.0934, "grad_norm": 0.810712993144989, "learning_rate": 7.511902517599407e-06, "step_time_sec": 113.87 }, { "step": 2495, "epoch": 0.7063486446316087, "wallclock": "2026-05-23T09:20:37.403219", "loss": 0.1027, "grad_norm": 1.004841685295105, "learning_rate": 7.501985069214561e-06, "step_time_sec": 113.94 }, { "step": 2500, "epoch": 0.7077641729775639, "wallclock": "2026-05-23T09:22:33.235203", "loss": 0.0982, "grad_norm": 0.7684575319290161, "learning_rate": 7.492054471925282e-06, "step_time_sec": 115.83, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 74.24 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 2500, "epoch": 0.7077641729775639, "wallclock": "2026-05-23T09:23:26.146278", "eval_loss": 0.11603201180696487, "eval_runtime": 52.8156, "eval_samples_per_second": 4.733, "eval_steps_per_second": 1.193, "gpu": [ { "gpu": 0, "mem_allocated_gb": 39.05, "mem_reserved_gb": 74.24 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 2505, "epoch": 0.709179701323519, "wallclock": "2026-05-23T09:27:05.152438", "loss": 0.1083, "grad_norm": 0.8736166954040527, "learning_rate": 7.482110777920719e-06, "step_time_sec": 271.92 }, { "step": 2510, "epoch": 0.7105952296694741, "wallclock": "2026-05-23T09:28:58.645822", "loss": 0.1197, "grad_norm": 1.1975699663162231, "learning_rate": 7.472154039458851e-06, "step_time_sec": 113.49 }, { "step": 2515, "epoch": 0.7120107580154292, "wallclock": "2026-05-23T09:30:54.443603", "loss": 0.1261, "grad_norm": 1.4840281009674072, "learning_rate": 7.462184308866209e-06, "step_time_sec": 115.8 }, { "step": 2520, "epoch": 0.7134262863613844, "wallclock": "2026-05-23T09:32:48.921941", "loss": 0.1001, "grad_norm": 0.9024205803871155, "learning_rate": 7.452201638537605e-06, "step_time_sec": 114.48 }, { "step": 2525, "epoch": 0.7148418147073395, "wallclock": "2026-05-23T09:34:43.594377", "loss": 0.0883, "grad_norm": 2.425753355026245, "learning_rate": 7.442206080935852e-06, "step_time_sec": 114.67 }, { "step": 2530, "epoch": 0.7162573430532947, "wallclock": "2026-05-23T09:36:38.043629", "loss": 0.1033, "grad_norm": 0.9202796816825867, "learning_rate": 7.432197688591494e-06, "step_time_sec": 114.45 }, { "step": 2535, "epoch": 0.7176728713992497, "wallclock": "2026-05-23T09:38:33.443082", "loss": 0.1229, "grad_norm": 0.8916212320327759, "learning_rate": 7.422176514102524e-06, "step_time_sec": 115.4 }, { "step": 2540, "epoch": 0.7190883997452049, "wallclock": "2026-05-23T09:40:26.131948", "loss": 0.0948, "grad_norm": 0.7314426898956299, "learning_rate": 7.41214261013411e-06, "step_time_sec": 112.69 }, { "step": 2545, "epoch": 0.72050392809116, "wallclock": "2026-05-23T09:42:19.625497", "loss": 0.1031, "grad_norm": 1.2673311233520508, "learning_rate": 7.402096029418317e-06, "step_time_sec": 113.49 }, { "step": 2550, "epoch": 0.7219194564371152, "wallclock": "2026-05-23T09:44:13.852248", "loss": 0.1199, "grad_norm": 0.9767388701438904, "learning_rate": 7.3920368247538384e-06, "step_time_sec": 114.23, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 74.24 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 2555, "epoch": 0.7233349847830702, "wallclock": "2026-05-23T09:46:07.447283", "loss": 0.1087, "grad_norm": 1.0202505588531494, "learning_rate": 7.381965049005703e-06, "step_time_sec": 113.6 }, { "step": 2560, "epoch": 0.7247505131290254, "wallclock": "2026-05-23T09:48:01.720028", "loss": 0.0971, "grad_norm": 1.1415823698043823, "learning_rate": 7.371880755105008e-06, "step_time_sec": 114.27 }, { "step": 2565, "epoch": 0.7261660414749805, "wallclock": "2026-05-23T09:49:56.656471", "loss": 0.1001, "grad_norm": 1.0273898839950562, "learning_rate": 7.361783996048641e-06, "step_time_sec": 114.94 }, { "step": 2570, "epoch": 0.7275815698209357, "wallclock": "2026-05-23T09:51:49.820193", "loss": 0.1057, "grad_norm": 1.1736416816711426, "learning_rate": 7.3516748248989955e-06, "step_time_sec": 113.16 }, { "step": 2575, "epoch": 0.7289970981668908, "wallclock": "2026-05-23T09:53:43.050372", "loss": 0.1056, "grad_norm": 0.8515759706497192, "learning_rate": 7.341553294783699e-06, "step_time_sec": 113.23 }, { "step": 2580, "epoch": 0.7304126265128459, "wallclock": "2026-05-23T09:55:37.522835", "loss": 0.1058, "grad_norm": 0.8394744992256165, "learning_rate": 7.3314194588953256e-06, "step_time_sec": 114.47 }, { "step": 2585, "epoch": 0.731828154858801, "wallclock": "2026-05-23T09:57:31.953180", "loss": 0.1082, "grad_norm": 0.7621601819992065, "learning_rate": 7.3212733704911235e-06, "step_time_sec": 114.43 }, { "step": 2590, "epoch": 0.7332436832047562, "wallclock": "2026-05-23T09:59:25.144746", "loss": 0.1147, "grad_norm": 1.1607191562652588, "learning_rate": 7.311115082892733e-06, "step_time_sec": 113.19 }, { "step": 2595, "epoch": 0.7346592115507113, "wallclock": "2026-05-23T10:01:19.943656", "loss": 0.1141, "grad_norm": 0.9936063289642334, "learning_rate": 7.300944649485908e-06, "step_time_sec": 114.8 }, { "step": 2600, "epoch": 0.7360747398966664, "wallclock": "2026-05-23T10:03:14.923839", "loss": 0.1048, "grad_norm": 0.7679593563079834, "learning_rate": 7.2907621237202275e-06, "step_time_sec": 114.98, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 74.24 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 2600, "epoch": 0.7360747398966664, "wallclock": "2026-05-23T10:04:07.231100", "eval_loss": 0.11498851329088211, "eval_runtime": 52.2032, "eval_samples_per_second": 4.789, "eval_steps_per_second": 1.207, "gpu": [ { "gpu": 0, "mem_allocated_gb": 39.05, "mem_reserved_gb": 74.24 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 2605, "epoch": 0.7374902682426215, "wallclock": "2026-05-23T10:07:42.813659", "loss": 0.0785, "grad_norm": 0.8581358790397644, "learning_rate": 7.280567559108825e-06, "step_time_sec": 267.89 }, { "step": 2610, "epoch": 0.7389057965885767, "wallclock": "2026-05-23T10:09:36.902094", "loss": 0.0795, "grad_norm": 1.4470053911209106, "learning_rate": 7.270361009228104e-06, "step_time_sec": 114.09 }, { "step": 2615, "epoch": 0.7403213249345318, "wallclock": "2026-05-23T10:11:32.049560", "loss": 0.1029, "grad_norm": 1.1154381036758423, "learning_rate": 7.260142527717449e-06, "step_time_sec": 115.15 }, { "step": 2620, "epoch": 0.741736853280487, "wallclock": "2026-05-23T10:13:28.524157", "loss": 0.114, "grad_norm": 1.143662929534912, "learning_rate": 7.249912168278954e-06, "step_time_sec": 116.47 }, { "step": 2625, "epoch": 0.743152381626442, "wallclock": "2026-05-23T10:15:25.719237", "loss": 0.1157, "grad_norm": 1.3383020162582397, "learning_rate": 7.23966998467714e-06, "step_time_sec": 117.2 }, { "step": 2630, "epoch": 0.7445679099723972, "wallclock": "2026-05-23T10:17:20.106607", "loss": 0.097, "grad_norm": 1.3460333347320557, "learning_rate": 7.229416030738661e-06, "step_time_sec": 114.39 }, { "step": 2635, "epoch": 0.7459834383183523, "wallclock": "2026-05-23T10:19:12.833927", "loss": 0.0934, "grad_norm": 1.0922449827194214, "learning_rate": 7.219150360352032e-06, "step_time_sec": 112.73 }, { "step": 2640, "epoch": 0.7473989666643075, "wallclock": "2026-05-23T10:21:07.756043", "loss": 0.1099, "grad_norm": 0.9513120651245117, "learning_rate": 7.208873027467345e-06, "step_time_sec": 114.92 }, { "step": 2645, "epoch": 0.7488144950102625, "wallclock": "2026-05-23T10:23:00.826108", "loss": 0.1106, "grad_norm": 0.9753119945526123, "learning_rate": 7.198584086095979e-06, "step_time_sec": 113.07 }, { "step": 2650, "epoch": 0.7502300233562177, "wallclock": "2026-05-23T10:24:56.030014", "loss": 0.0936, "grad_norm": 1.4077311754226685, "learning_rate": 7.188283590310322e-06, "step_time_sec": 115.2, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 74.24 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 2655, "epoch": 0.7516455517021728, "wallclock": "2026-05-23T10:26:50.125758", "loss": 0.1239, "grad_norm": 0.8350121378898621, "learning_rate": 7.177971594243486e-06, "step_time_sec": 114.1 }, { "step": 2660, "epoch": 0.753061080048128, "wallclock": "2026-05-23T10:28:44.428938", "loss": 0.1024, "grad_norm": 1.0880407094955444, "learning_rate": 7.167648152089017e-06, "step_time_sec": 114.3 }, { "step": 2665, "epoch": 0.754476608394083, "wallclock": "2026-05-23T10:30:38.443004", "loss": 0.1, "grad_norm": 1.0360862016677856, "learning_rate": 7.157313318100622e-06, "step_time_sec": 114.01 }, { "step": 2670, "epoch": 0.7558921367400382, "wallclock": "2026-05-23T10:32:32.126033", "loss": 0.1126, "grad_norm": 1.0407313108444214, "learning_rate": 7.14696714659187e-06, "step_time_sec": 113.68 }, { "step": 2675, "epoch": 0.7573076650859933, "wallclock": "2026-05-23T10:34:25.914981", "loss": 0.0933, "grad_norm": 1.346449613571167, "learning_rate": 7.136609691935914e-06, "step_time_sec": 113.79 }, { "step": 2680, "epoch": 0.7587231934319485, "wallclock": "2026-05-23T10:36:21.156179", "loss": 0.0893, "grad_norm": 1.2292298078536987, "learning_rate": 7.1262410085652075e-06, "step_time_sec": 115.24 }, { "step": 2685, "epoch": 0.7601387217779036, "wallclock": "2026-05-23T10:38:15.006638", "loss": 0.1224, "grad_norm": 1.2744159698486328, "learning_rate": 7.115861150971215e-06, "step_time_sec": 113.85 }, { "step": 2690, "epoch": 0.7615542501238587, "wallclock": "2026-05-23T10:40:09.527798", "loss": 0.0863, "grad_norm": 1.0019073486328125, "learning_rate": 7.105470173704121e-06, "step_time_sec": 114.52 }, { "step": 2695, "epoch": 0.7629697784698138, "wallclock": "2026-05-23T10:42:03.509958", "loss": 0.098, "grad_norm": 1.0547888278961182, "learning_rate": 7.095068131372552e-06, "step_time_sec": 113.98 }, { "step": 2700, "epoch": 0.764385306815769, "wallclock": "2026-05-23T10:44:00.051414", "loss": 0.116, "grad_norm": 0.9419006109237671, "learning_rate": 7.0846550786432885e-06, "step_time_sec": 116.54, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 74.24 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 2700, "epoch": 0.764385306815769, "wallclock": "2026-05-23T10:44:52.913063", "eval_loss": 0.110720694065094, "eval_runtime": 52.7686, "eval_samples_per_second": 4.738, "eval_steps_per_second": 1.194, "gpu": [ { "gpu": 0, "mem_allocated_gb": 39.05, "mem_reserved_gb": 74.24 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 2705, "epoch": 0.7658008351617241, "wallclock": "2026-05-23T10:48:31.237934", "loss": 0.0998, "grad_norm": 1.0532370805740356, "learning_rate": 7.074231070240969e-06, "step_time_sec": 271.19 }, { "step": 2710, "epoch": 0.7672163635076792, "wallclock": "2026-05-23T10:50:25.703822", "loss": 0.1059, "grad_norm": 1.1707059144973755, "learning_rate": 7.063796160947811e-06, "step_time_sec": 114.47 }, { "step": 2715, "epoch": 0.7686318918536343, "wallclock": "2026-05-23T10:52:20.230127", "loss": 0.0836, "grad_norm": 1.0319560766220093, "learning_rate": 7.0533504056033234e-06, "step_time_sec": 114.53 }, { "step": 2720, "epoch": 0.7700474201995895, "wallclock": "2026-05-23T10:54:15.428683", "loss": 0.0971, "grad_norm": 1.1601600646972656, "learning_rate": 7.042893859104008e-06, "step_time_sec": 115.2 }, { "step": 2725, "epoch": 0.7714629485455446, "wallclock": "2026-05-23T10:56:09.560972", "loss": 0.0808, "grad_norm": 1.0438365936279297, "learning_rate": 7.032426576403084e-06, "step_time_sec": 114.13 }, { "step": 2730, "epoch": 0.7728784768914998, "wallclock": "2026-05-23T10:58:03.848888", "loss": 0.102, "grad_norm": 1.1061596870422363, "learning_rate": 7.021948612510194e-06, "step_time_sec": 114.29 }, { "step": 2735, "epoch": 0.7742940052374548, "wallclock": "2026-05-23T10:59:58.320211", "loss": 0.0984, "grad_norm": 0.7871215343475342, "learning_rate": 7.011460022491111e-06, "step_time_sec": 114.47 }, { "step": 2740, "epoch": 0.77570953358341, "wallclock": "2026-05-23T11:01:53.123512", "loss": 0.0861, "grad_norm": 0.9695367813110352, "learning_rate": 7.000960861467454e-06, "step_time_sec": 114.8 }, { "step": 2745, "epoch": 0.7771250619293651, "wallclock": "2026-05-23T11:03:47.400982", "loss": 0.0988, "grad_norm": 0.9494866132736206, "learning_rate": 6.990451184616399e-06, "step_time_sec": 114.28 }, { "step": 2750, "epoch": 0.7785405902753203, "wallclock": "2026-05-23T11:05:41.439834", "loss": 0.0848, "grad_norm": 0.8476992249488831, "learning_rate": 6.979931047170382e-06, "step_time_sec": 114.04, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 74.24 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 2755, "epoch": 0.7799561186212753, "wallclock": "2026-05-23T11:07:35.107884", "loss": 0.0877, "grad_norm": 0.9056459069252014, "learning_rate": 6.969400504416816e-06, "step_time_sec": 113.67 }, { "step": 2760, "epoch": 0.7813716469672305, "wallclock": "2026-05-23T11:09:27.251632", "loss": 0.0942, "grad_norm": 1.1086695194244385, "learning_rate": 6.9588596116978015e-06, "step_time_sec": 112.14 }, { "step": 2765, "epoch": 0.7827871753131856, "wallclock": "2026-05-23T11:11:21.586762", "loss": 0.1023, "grad_norm": 1.655490756034851, "learning_rate": 6.948308424409824e-06, "step_time_sec": 114.34 }, { "step": 2770, "epoch": 0.7842027036591408, "wallclock": "2026-05-23T11:13:15.016276", "loss": 0.1057, "grad_norm": 0.9345031380653381, "learning_rate": 6.937746998003477e-06, "step_time_sec": 113.43 }, { "step": 2775, "epoch": 0.785618232005096, "wallclock": "2026-05-23T11:15:09.371971", "loss": 0.0827, "grad_norm": 1.2220042943954468, "learning_rate": 6.927175387983165e-06, "step_time_sec": 114.36 }, { "step": 2780, "epoch": 0.787033760351051, "wallclock": "2026-05-23T11:17:03.650861", "loss": 0.1028, "grad_norm": 0.8835825324058533, "learning_rate": 6.9165936499068065e-06, "step_time_sec": 114.28 }, { "step": 2785, "epoch": 0.7884492886970061, "wallclock": "2026-05-23T11:18:57.806390", "loss": 0.095, "grad_norm": 1.1001851558685303, "learning_rate": 6.906001839385551e-06, "step_time_sec": 114.16 }, { "step": 2790, "epoch": 0.7898648170429613, "wallclock": "2026-05-23T11:20:52.228547", "loss": 0.0906, "grad_norm": 0.7298992276191711, "learning_rate": 6.895400012083482e-06, "step_time_sec": 114.42 }, { "step": 2795, "epoch": 0.7912803453889165, "wallclock": "2026-05-23T11:22:47.415434", "loss": 0.1135, "grad_norm": 0.8096187710762024, "learning_rate": 6.884788223717326e-06, "step_time_sec": 115.19 }, { "step": 2800, "epoch": 0.7926958737348715, "wallclock": "2026-05-23T11:24:42.100846", "loss": 0.0896, "grad_norm": 0.7147625088691711, "learning_rate": 6.874166530056153e-06, "step_time_sec": 114.69, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 74.24 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 2800, "epoch": 0.7926958737348715, "wallclock": "2026-05-23T11:25:35.071421", "eval_loss": 0.10768646746873856, "eval_runtime": 52.8634, "eval_samples_per_second": 4.729, "eval_steps_per_second": 1.192, "gpu": [ { "gpu": 0, "mem_allocated_gb": 39.05, "mem_reserved_gb": 74.24 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 2805, "epoch": 0.7941114020808266, "wallclock": "2026-05-23T11:29:12.349456", "loss": 0.104, "grad_norm": 1.1961441040039062, "learning_rate": 6.863534986921098e-06, "step_time_sec": 270.25 }, { "step": 2810, "epoch": 0.7955269304267818, "wallclock": "2026-05-23T11:31:06.641212", "loss": 0.0879, "grad_norm": 0.8926189541816711, "learning_rate": 6.852893650185051e-06, "step_time_sec": 114.29 }, { "step": 2815, "epoch": 0.796942458772737, "wallclock": "2026-05-23T11:33:01.446223", "loss": 0.0953, "grad_norm": 0.6535293459892273, "learning_rate": 6.842242575772374e-06, "step_time_sec": 114.81 }, { "step": 2820, "epoch": 0.7983579871186921, "wallclock": "2026-05-23T11:34:55.451278", "loss": 0.0921, "grad_norm": 1.124362587928772, "learning_rate": 6.831581819658608e-06, "step_time_sec": 114.01 }, { "step": 2825, "epoch": 0.7997735154646471, "wallclock": "2026-05-23T11:36:49.662998", "loss": 0.1037, "grad_norm": 0.7776113152503967, "learning_rate": 6.820911437870169e-06, "step_time_sec": 114.21 }, { "step": 2830, "epoch": 0.8011890438106023, "wallclock": "2026-05-23T11:38:45.282209", "loss": 0.0958, "grad_norm": 1.1590611934661865, "learning_rate": 6.810231486484064e-06, "step_time_sec": 115.62 }, { "step": 2835, "epoch": 0.8026045721565574, "wallclock": "2026-05-23T11:40:40.066510", "loss": 0.0928, "grad_norm": 0.9135128259658813, "learning_rate": 6.79954202162759e-06, "step_time_sec": 114.78 }, { "step": 2840, "epoch": 0.8040201005025126, "wallclock": "2026-05-23T11:42:35.293647", "loss": 0.1171, "grad_norm": 1.5331295728683472, "learning_rate": 6.788843099478041e-06, "step_time_sec": 115.23 }, { "step": 2845, "epoch": 0.8054356288484676, "wallclock": "2026-05-23T11:44:32.410482", "loss": 0.0892, "grad_norm": 0.802897036075592, "learning_rate": 6.778134776262413e-06, "step_time_sec": 117.12 }, { "step": 2850, "epoch": 0.8068511571944228, "wallclock": "2026-05-23T11:46:26.926697", "loss": 0.1081, "grad_norm": 1.0739949941635132, "learning_rate": 6.76741710825711e-06, "step_time_sec": 114.52, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 74.24 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 2855, "epoch": 0.808266685540378, "wallclock": "2026-05-23T11:48:20.898137", "loss": 0.0906, "grad_norm": 1.2039380073547363, "learning_rate": 6.756690151787643e-06, "step_time_sec": 113.97 }, { "step": 2860, "epoch": 0.8096822138863331, "wallclock": "2026-05-23T11:50:16.624972", "loss": 0.0765, "grad_norm": 0.9947606325149536, "learning_rate": 6.74595396322834e-06, "step_time_sec": 115.73 }, { "step": 2865, "epoch": 0.8110977422322883, "wallclock": "2026-05-23T11:52:08.742942", "loss": 0.0886, "grad_norm": 1.0721163749694824, "learning_rate": 6.735208599002048e-06, "step_time_sec": 112.12 }, { "step": 2870, "epoch": 0.8125132705782433, "wallclock": "2026-05-23T11:54:01.649122", "loss": 0.0956, "grad_norm": 0.9984346628189087, "learning_rate": 6.724454115579832e-06, "step_time_sec": 112.91 }, { "step": 2875, "epoch": 0.8139287989241984, "wallclock": "2026-05-23T11:55:55.986094", "loss": 0.1013, "grad_norm": 0.8976569771766663, "learning_rate": 6.713690569480685e-06, "step_time_sec": 114.34 }, { "step": 2880, "epoch": 0.8153443272701536, "wallclock": "2026-05-23T11:57:50.253656", "loss": 0.11, "grad_norm": 1.3766424655914307, "learning_rate": 6.7029180172712295e-06, "step_time_sec": 114.27 }, { "step": 2885, "epoch": 0.8167598556161088, "wallclock": "2026-05-23T11:59:45.942765", "loss": 0.0871, "grad_norm": 0.7033481597900391, "learning_rate": 6.6921365155654126e-06, "step_time_sec": 115.69 }, { "step": 2890, "epoch": 0.8181753839620638, "wallclock": "2026-05-23T12:01:42.248219", "loss": 0.0872, "grad_norm": 1.1330105066299438, "learning_rate": 6.6813461210242215e-06, "step_time_sec": 116.31 }, { "step": 2895, "epoch": 0.819590912308019, "wallclock": "2026-05-23T12:03:36.053753", "loss": 0.1055, "grad_norm": 1.2184752225875854, "learning_rate": 6.670546890355374e-06, "step_time_sec": 113.81 }, { "step": 2900, "epoch": 0.8210064406539741, "wallclock": "2026-05-23T12:05:30.028128", "loss": 0.1058, "grad_norm": 0.665178656578064, "learning_rate": 6.659738880313025e-06, "step_time_sec": 113.97, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 74.24 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 2900, "epoch": 0.8210064406539741, "wallclock": "2026-05-23T12:06:22.012953", "eval_loss": 0.10808777064085007, "eval_runtime": 51.8846, "eval_samples_per_second": 4.818, "eval_steps_per_second": 1.214, "gpu": [ { "gpu": 0, "mem_allocated_gb": 39.05, "mem_reserved_gb": 74.24 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 2905, "epoch": 0.8224219689999293, "wallclock": "2026-05-23T12:09:56.926177", "loss": 0.0969, "grad_norm": 1.386168122291565, "learning_rate": 6.648922147697471e-06, "step_time_sec": 266.9 }, { "step": 2910, "epoch": 0.8238374973458844, "wallclock": "2026-05-23T12:11:52.616991", "loss": 0.0787, "grad_norm": 1.0408498048782349, "learning_rate": 6.63809674935485e-06, "step_time_sec": 115.69 }, { "step": 2915, "epoch": 0.8252530256918394, "wallclock": "2026-05-23T12:13:47.509339", "loss": 0.106, "grad_norm": 1.0766488313674927, "learning_rate": 6.6272627421768366e-06, "step_time_sec": 114.89 }, { "step": 2920, "epoch": 0.8266685540377946, "wallclock": "2026-05-23T12:15:43.081952", "loss": 0.0859, "grad_norm": 0.8988505005836487, "learning_rate": 6.616420183100353e-06, "step_time_sec": 115.57 }, { "step": 2925, "epoch": 0.8280840823837498, "wallclock": "2026-05-23T12:17:37.913984", "loss": 0.09, "grad_norm": 1.0285881757736206, "learning_rate": 6.605569129107263e-06, "step_time_sec": 114.83 }, { "step": 2930, "epoch": 0.8294996107297049, "wallclock": "2026-05-23T12:19:32.363447", "loss": 0.0921, "grad_norm": 1.0034139156341553, "learning_rate": 6.594709637224075e-06, "step_time_sec": 114.45 }, { "step": 2935, "epoch": 0.83091513907566, "wallclock": "2026-05-23T12:21:27.369008", "loss": 0.0802, "grad_norm": 0.8240336775779724, "learning_rate": 6.583841764521641e-06, "step_time_sec": 115.01 }, { "step": 2940, "epoch": 0.8323306674216151, "wallclock": "2026-05-23T12:23:23.007495", "loss": 0.095, "grad_norm": 1.2371604442596436, "learning_rate": 6.572965568114859e-06, "step_time_sec": 115.64 }, { "step": 2945, "epoch": 0.8337461957675703, "wallclock": "2026-05-23T12:25:16.996557", "loss": 0.095, "grad_norm": 1.1819149255752563, "learning_rate": 6.562081105162369e-06, "step_time_sec": 113.99 }, { "step": 2950, "epoch": 0.8351617241135254, "wallclock": "2026-05-23T12:27:11.125332", "loss": 0.08, "grad_norm": 1.0016002655029297, "learning_rate": 6.551188432866257e-06, "step_time_sec": 114.13, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 74.24 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 2955, "epoch": 0.8365772524594804, "wallclock": "2026-05-23T12:29:06.207323", "loss": 0.1015, "grad_norm": 1.2419204711914062, "learning_rate": 6.5402876084717514e-06, "step_time_sec": 115.08 }, { "step": 2960, "epoch": 0.8379927808054356, "wallclock": "2026-05-23T12:31:00.634331", "loss": 0.0848, "grad_norm": 1.0234307050704956, "learning_rate": 6.529378689266923e-06, "step_time_sec": 114.43 }, { "step": 2965, "epoch": 0.8394083091513908, "wallclock": "2026-05-23T12:32:54.709674", "loss": 0.1004, "grad_norm": 1.3117458820343018, "learning_rate": 6.518461732582385e-06, "step_time_sec": 114.08 }, { "step": 2970, "epoch": 0.8408238374973459, "wallclock": "2026-05-23T12:34:50.639025", "loss": 0.0861, "grad_norm": 0.6640080213546753, "learning_rate": 6.507536795790989e-06, "step_time_sec": 115.93 }, { "step": 2975, "epoch": 0.8422393658433011, "wallclock": "2026-05-23T12:36:44.816198", "loss": 0.0921, "grad_norm": 0.7706874012947083, "learning_rate": 6.496603936307525e-06, "step_time_sec": 114.18 }, { "step": 2980, "epoch": 0.8436548941892561, "wallclock": "2026-05-23T12:38:41.632400", "loss": 0.0774, "grad_norm": 0.9700288772583008, "learning_rate": 6.4856632115884245e-06, "step_time_sec": 116.82 }, { "step": 2985, "epoch": 0.8450704225352113, "wallclock": "2026-05-23T12:40:37.751686", "loss": 0.0827, "grad_norm": 1.0276799201965332, "learning_rate": 6.4747146791314456e-06, "step_time_sec": 116.12 }, { "step": 2990, "epoch": 0.8464859508811664, "wallclock": "2026-05-23T12:42:31.949658", "loss": 0.1038, "grad_norm": 1.124481439590454, "learning_rate": 6.4637583964753855e-06, "step_time_sec": 114.2 }, { "step": 2995, "epoch": 0.8479014792271216, "wallclock": "2026-05-23T12:44:25.803641", "loss": 0.1034, "grad_norm": 1.4556708335876465, "learning_rate": 6.452794421199772e-06, "step_time_sec": 113.85 }, { "step": 3000, "epoch": 0.8493170075730766, "wallclock": "2026-05-23T12:46:20.662475", "loss": 0.0808, "grad_norm": 0.7637086510658264, "learning_rate": 6.441822810924555e-06, "step_time_sec": 114.86, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 74.24 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 3000, "epoch": 0.8493170075730766, "wallclock": "2026-05-23T12:47:14.237339", "eval_loss": 0.10439032316207886, "eval_runtime": 53.4665, "eval_samples_per_second": 4.676, "eval_steps_per_second": 1.178, "gpu": [ { "gpu": 0, "mem_allocated_gb": 39.05, "mem_reserved_gb": 74.24 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 3005, "epoch": 0.8507325359190318, "wallclock": "2026-05-23T12:50:47.509195", "loss": 0.0871, "grad_norm": 1.2917267084121704, "learning_rate": 6.430843623309815e-06, "step_time_sec": 266.85 }, { "step": 3010, "epoch": 0.8521480642649869, "wallclock": "2026-05-23T12:52:42.040424", "loss": 0.0832, "grad_norm": 1.129804015159607, "learning_rate": 6.419856916055453e-06, "step_time_sec": 114.53 }, { "step": 3015, "epoch": 0.8535635926109421, "wallclock": "2026-05-23T12:54:39.916099", "loss": 0.0821, "grad_norm": 0.72309809923172, "learning_rate": 6.408862746900884e-06, "step_time_sec": 117.88 }, { "step": 3020, "epoch": 0.8549791209568972, "wallclock": "2026-05-23T12:56:33.338244", "loss": 0.0764, "grad_norm": 0.6796430349349976, "learning_rate": 6.397861173624745e-06, "step_time_sec": 113.42 }, { "step": 3025, "epoch": 0.8563946493028523, "wallclock": "2026-05-23T12:58:26.302630", "loss": 0.0984, "grad_norm": 1.0264241695404053, "learning_rate": 6.386852254044582e-06, "step_time_sec": 112.96 }, { "step": 3030, "epoch": 0.8578101776488074, "wallclock": "2026-05-23T13:00:19.802002", "loss": 0.0875, "grad_norm": 1.4211701154708862, "learning_rate": 6.375836046016547e-06, "step_time_sec": 113.5 }, { "step": 3035, "epoch": 0.8592257059947626, "wallclock": "2026-05-23T13:02:13.721659", "loss": 0.0833, "grad_norm": 1.0724290609359741, "learning_rate": 6.3648126074350955e-06, "step_time_sec": 113.92 }, { "step": 3040, "epoch": 0.8606412343407177, "wallclock": "2026-05-23T13:04:08.151856", "loss": 0.0943, "grad_norm": 0.9527065753936768, "learning_rate": 6.353781996232689e-06, "step_time_sec": 114.43 }, { "step": 3045, "epoch": 0.8620567626866728, "wallclock": "2026-05-23T13:06:02.084910", "loss": 0.0915, "grad_norm": 0.9171473979949951, "learning_rate": 6.342744270379471e-06, "step_time_sec": 113.93 }, { "step": 3050, "epoch": 0.8634722910326279, "wallclock": "2026-05-23T13:07:56.129979", "loss": 0.0772, "grad_norm": 1.1974050998687744, "learning_rate": 6.331699487882987e-06, "step_time_sec": 114.05, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 74.24 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 3055, "epoch": 0.8648878193785831, "wallclock": "2026-05-23T13:09:50.014721", "loss": 0.079, "grad_norm": 1.2516535520553589, "learning_rate": 6.320647706787866e-06, "step_time_sec": 113.88 }, { "step": 3060, "epoch": 0.8663033477245382, "wallclock": "2026-05-23T13:11:43.877111", "loss": 0.0819, "grad_norm": 0.8899274468421936, "learning_rate": 6.30958898517551e-06, "step_time_sec": 113.86 }, { "step": 3065, "epoch": 0.8677188760704934, "wallclock": "2026-05-23T13:13:37.608088", "loss": 0.0766, "grad_norm": 0.6620562076568604, "learning_rate": 6.298523381163805e-06, "step_time_sec": 113.73 }, { "step": 3070, "epoch": 0.8691344044164484, "wallclock": "2026-05-23T13:15:31.210530", "loss": 0.0868, "grad_norm": 1.2216447591781616, "learning_rate": 6.287450952906802e-06, "step_time_sec": 113.6 }, { "step": 3075, "epoch": 0.8705499327624036, "wallclock": "2026-05-23T13:17:25.173111", "loss": 0.0823, "grad_norm": 1.8554191589355469, "learning_rate": 6.276371758594416e-06, "step_time_sec": 113.96 }, { "step": 3080, "epoch": 0.8719654611083587, "wallclock": "2026-05-23T13:19:19.783902", "loss": 0.1078, "grad_norm": 1.097886085510254, "learning_rate": 6.265285856452123e-06, "step_time_sec": 114.61 }, { "step": 3085, "epoch": 0.8733809894543139, "wallclock": "2026-05-23T13:21:13.021188", "loss": 0.1032, "grad_norm": 0.9588475227355957, "learning_rate": 6.254193304740648e-06, "step_time_sec": 113.24 }, { "step": 3090, "epoch": 0.8747965178002689, "wallclock": "2026-05-23T13:23:05.522960", "loss": 0.0746, "grad_norm": 0.9044705629348755, "learning_rate": 6.243094161755664e-06, "step_time_sec": 112.5 }, { "step": 3095, "epoch": 0.876212046146224, "wallclock": "2026-05-23T13:25:01.305677", "loss": 0.0996, "grad_norm": 1.350035309791565, "learning_rate": 6.231988485827483e-06, "step_time_sec": 115.78 }, { "step": 3100, "epoch": 0.8776275744921792, "wallclock": "2026-05-23T13:26:54.684490", "loss": 0.0947, "grad_norm": 1.00934898853302, "learning_rate": 6.220876335320752e-06, "step_time_sec": 113.38, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 74.24 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 3100, "epoch": 0.8776275744921792, "wallclock": "2026-05-23T13:27:47.250665", "eval_loss": 0.10196959972381592, "eval_runtime": 52.4629, "eval_samples_per_second": 4.765, "eval_steps_per_second": 1.201, "gpu": [ { "gpu": 0, "mem_allocated_gb": 39.05, "mem_reserved_gb": 74.24 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 3105, "epoch": 0.8790431028381344, "wallclock": "2026-05-23T13:31:22.745504", "loss": 0.076, "grad_norm": 1.0037931203842163, "learning_rate": 6.209757768634142e-06, "step_time_sec": 268.06 }, { "step": 3110, "epoch": 0.8804586311840895, "wallclock": "2026-05-23T13:33:17.015868", "loss": 0.1029, "grad_norm": 1.383480191230774, "learning_rate": 6.1986328442000425e-06, "step_time_sec": 114.27 }, { "step": 3115, "epoch": 0.8818741595300446, "wallclock": "2026-05-23T13:35:09.821778", "loss": 0.0865, "grad_norm": 1.2162877321243286, "learning_rate": 6.18750162048426e-06, "step_time_sec": 112.81 }, { "step": 3120, "epoch": 0.8832896878759997, "wallclock": "2026-05-23T13:37:02.349154", "loss": 0.0919, "grad_norm": 0.8934468626976013, "learning_rate": 6.176364155985701e-06, "step_time_sec": 112.53 }, { "step": 3125, "epoch": 0.8847052162219549, "wallclock": "2026-05-23T13:38:56.196594", "loss": 0.0939, "grad_norm": 0.848867654800415, "learning_rate": 6.165220509236076e-06, "step_time_sec": 113.85 }, { "step": 3130, "epoch": 0.88612074456791, "wallclock": "2026-05-23T13:40:49.415671", "loss": 0.0871, "grad_norm": 1.3182566165924072, "learning_rate": 6.1540707387995775e-06, "step_time_sec": 113.22 }, { "step": 3135, "epoch": 0.887536272913865, "wallclock": "2026-05-23T13:42:44.238969", "loss": 0.1092, "grad_norm": 1.0121556520462036, "learning_rate": 6.1429149032725875e-06, "step_time_sec": 114.82 }, { "step": 3140, "epoch": 0.8889518012598202, "wallclock": "2026-05-23T13:44:39.595399", "loss": 0.0762, "grad_norm": 1.2405686378479004, "learning_rate": 6.13175306128336e-06, "step_time_sec": 115.36 }, { "step": 3145, "epoch": 0.8903673296057754, "wallclock": "2026-05-23T13:46:34.620333", "loss": 0.0945, "grad_norm": 1.2402104139328003, "learning_rate": 6.120585271491713e-06, "step_time_sec": 115.02 }, { "step": 3150, "epoch": 0.8917828579517305, "wallclock": "2026-05-23T13:48:29.023383", "loss": 0.0857, "grad_norm": 0.910408616065979, "learning_rate": 6.1094115925887235e-06, "step_time_sec": 114.4, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 74.24 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 3155, "epoch": 0.8931983862976857, "wallclock": "2026-05-23T13:50:23.390785", "loss": 0.0891, "grad_norm": 0.9833782315254211, "learning_rate": 6.098232083296423e-06, "step_time_sec": 114.37 }, { "step": 3160, "epoch": 0.8946139146436407, "wallclock": "2026-05-23T13:52:17.597934", "loss": 0.0749, "grad_norm": 1.2741199731826782, "learning_rate": 6.087046802367476e-06, "step_time_sec": 114.21 }, { "step": 3165, "epoch": 0.8960294429895959, "wallclock": "2026-05-23T13:54:11.564969", "loss": 0.0801, "grad_norm": 1.4001483917236328, "learning_rate": 6.075855808584886e-06, "step_time_sec": 113.97 }, { "step": 3170, "epoch": 0.897444971335551, "wallclock": "2026-05-23T13:56:05.962888", "loss": 0.0825, "grad_norm": 0.909929633140564, "learning_rate": 6.064659160761676e-06, "step_time_sec": 114.4 }, { "step": 3175, "epoch": 0.8988604996815062, "wallclock": "2026-05-23T13:58:01.036489", "loss": 0.0584, "grad_norm": 0.8718348145484924, "learning_rate": 6.053456917740585e-06, "step_time_sec": 115.07 }, { "step": 3180, "epoch": 0.9002760280274612, "wallclock": "2026-05-23T13:59:54.455386", "loss": 0.0979, "grad_norm": 1.4148125648498535, "learning_rate": 6.042249138393753e-06, "step_time_sec": 113.42 }, { "step": 3185, "epoch": 0.9016915563734164, "wallclock": "2026-05-23T14:01:49.207549", "loss": 0.0914, "grad_norm": 0.9834646582603455, "learning_rate": 6.031035881622422e-06, "step_time_sec": 114.75 }, { "step": 3190, "epoch": 0.9031070847193715, "wallclock": "2026-05-23T14:03:45.252526", "loss": 0.1002, "grad_norm": 1.3153408765792847, "learning_rate": 6.019817206356615e-06, "step_time_sec": 116.04 }, { "step": 3195, "epoch": 0.9045226130653267, "wallclock": "2026-05-23T14:05:40.611460", "loss": 0.0856, "grad_norm": 0.9440031051635742, "learning_rate": 6.008593171554833e-06, "step_time_sec": 115.36 }, { "step": 3200, "epoch": 0.9059381414112817, "wallclock": "2026-05-23T14:07:53.973146", "loss": 0.0969, "grad_norm": 1.2231155633926392, "learning_rate": 5.997363836203744e-06, "step_time_sec": 133.36, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 74.24 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 3200, "epoch": 0.9059381414112817, "wallclock": "2026-05-23T14:09:02.006751", "eval_loss": 0.09937935322523117, "eval_runtime": 67.9272, "eval_samples_per_second": 3.68, "eval_steps_per_second": 0.927, "gpu": [ { "gpu": 0, "mem_allocated_gb": 39.05, "mem_reserved_gb": 74.24 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 3205, "epoch": 0.9073536697572369, "wallclock": "2026-05-23T14:12:39.937912", "loss": 0.1062, "grad_norm": 1.7902098894119263, "learning_rate": 5.98612925931787e-06, "step_time_sec": 285.96 }, { "step": 3210, "epoch": 0.908769198103192, "wallclock": "2026-05-23T14:14:33.769972", "loss": 0.0924, "grad_norm": 1.207891583442688, "learning_rate": 5.974889499939283e-06, "step_time_sec": 113.83 }, { "step": 3215, "epoch": 0.9101847264491472, "wallclock": "2026-05-23T14:16:28.806370", "loss": 0.0685, "grad_norm": 1.0537505149841309, "learning_rate": 5.96364461713729e-06, "step_time_sec": 115.04 }, { "step": 3220, "epoch": 0.9116002547951023, "wallclock": "2026-05-23T14:18:23.631952", "loss": 0.0881, "grad_norm": 1.5671195983886719, "learning_rate": 5.952394670008119e-06, "step_time_sec": 114.83 }, { "step": 3225, "epoch": 0.9130157831410574, "wallclock": "2026-05-23T14:20:17.612828", "loss": 0.1033, "grad_norm": 1.521396517753601, "learning_rate": 5.94113971767462e-06, "step_time_sec": 113.98 }, { "step": 3230, "epoch": 0.9144313114870125, "wallclock": "2026-05-23T14:22:11.445585", "loss": 0.0618, "grad_norm": 0.9208618402481079, "learning_rate": 5.9298798192859434e-06, "step_time_sec": 113.83 }, { "step": 3235, "epoch": 0.9158468398329677, "wallclock": "2026-05-23T14:24:06.144035", "loss": 0.0873, "grad_norm": 1.1370309591293335, "learning_rate": 5.9186150340172325e-06, "step_time_sec": 114.7 }, { "step": 3240, "epoch": 0.9172623681789228, "wallclock": "2026-05-23T14:25:59.240016", "loss": 0.0803, "grad_norm": 1.02957022190094, "learning_rate": 5.907345421069314e-06, "step_time_sec": 113.1 }, { "step": 3245, "epoch": 0.9186778965248779, "wallclock": "2026-05-23T14:27:52.197012", "loss": 0.0811, "grad_norm": 1.390236496925354, "learning_rate": 5.896071039668388e-06, "step_time_sec": 112.96 }, { "step": 3250, "epoch": 0.920093424870833, "wallclock": "2026-05-23T14:29:46.705683", "loss": 0.0873, "grad_norm": 1.451936960220337, "learning_rate": 5.8847919490657114e-06, "step_time_sec": 114.51, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 74.24 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 3255, "epoch": 0.9215089532167882, "wallclock": "2026-05-23T14:31:40.040364", "loss": 0.087, "grad_norm": 1.092909574508667, "learning_rate": 5.873508208537291e-06, "step_time_sec": 113.33 }, { "step": 3260, "epoch": 0.9229244815627433, "wallclock": "2026-05-23T14:33:33.153697", "loss": 0.0701, "grad_norm": 1.0500355958938599, "learning_rate": 5.8622198773835725e-06, "step_time_sec": 113.11 }, { "step": 3265, "epoch": 0.9243400099086985, "wallclock": "2026-05-23T14:35:26.975118", "loss": 0.0858, "grad_norm": 1.4215220212936401, "learning_rate": 5.850927014929124e-06, "step_time_sec": 113.82 }, { "step": 3270, "epoch": 0.9257555382546535, "wallclock": "2026-05-23T14:37:21.438405", "loss": 0.0904, "grad_norm": 1.1870381832122803, "learning_rate": 5.83962968052233e-06, "step_time_sec": 114.46 }, { "step": 3275, "epoch": 0.9271710666006087, "wallclock": "2026-05-23T14:39:15.451984", "loss": 0.0791, "grad_norm": 0.9800876379013062, "learning_rate": 5.828327933535075e-06, "step_time_sec": 114.01 }, { "step": 3280, "epoch": 0.9285865949465638, "wallclock": "2026-05-23T14:41:09.339469", "loss": 0.0825, "grad_norm": 1.2808606624603271, "learning_rate": 5.817021833362434e-06, "step_time_sec": 113.89 }, { "step": 3285, "epoch": 0.930002123292519, "wallclock": "2026-05-23T14:43:02.863576", "loss": 0.1006, "grad_norm": 0.8630105257034302, "learning_rate": 5.805711439422361e-06, "step_time_sec": 113.52 }, { "step": 3290, "epoch": 0.931417651638474, "wallclock": "2026-05-23T14:44:56.304812", "loss": 0.0951, "grad_norm": 1.7691140174865723, "learning_rate": 5.794396811155372e-06, "step_time_sec": 113.44 }, { "step": 3295, "epoch": 0.9328331799844292, "wallclock": "2026-05-23T14:46:51.225086", "loss": 0.0859, "grad_norm": 1.175764799118042, "learning_rate": 5.78307800802424e-06, "step_time_sec": 114.92 }, { "step": 3300, "epoch": 0.9342487083303843, "wallclock": "2026-05-23T14:48:45.011673", "loss": 0.0789, "grad_norm": 1.243912696838379, "learning_rate": 5.771755089513678e-06, "step_time_sec": 113.79, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 74.24 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 3300, "epoch": 0.9342487083303843, "wallclock": "2026-05-23T14:49:37.428370", "eval_loss": 0.09591619670391083, "eval_runtime": 52.3234, "eval_samples_per_second": 4.778, "eval_steps_per_second": 1.204, "gpu": [ { "gpu": 0, "mem_allocated_gb": 39.05, "mem_reserved_gb": 74.24 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 3305, "epoch": 0.9356642366763395, "wallclock": "2026-05-23T14:53:13.024206", "loss": 0.0852, "grad_norm": 1.0862802267074585, "learning_rate": 5.760428115130021e-06, "step_time_sec": 268.01 }, { "step": 3310, "epoch": 0.9370797650222946, "wallclock": "2026-05-23T14:55:06.032103", "loss": 0.0869, "grad_norm": 1.217329502105713, "learning_rate": 5.749097144400929e-06, "step_time_sec": 113.01 }, { "step": 3315, "epoch": 0.9384952933682497, "wallclock": "2026-05-23T14:57:00.935813", "loss": 0.0695, "grad_norm": 0.9838262796401978, "learning_rate": 5.737762236875057e-06, "step_time_sec": 114.9 }, { "step": 3320, "epoch": 0.9399108217142048, "wallclock": "2026-05-23T14:58:55.441455", "loss": 0.0866, "grad_norm": 1.0086387395858765, "learning_rate": 5.726423452121751e-06, "step_time_sec": 114.51 }, { "step": 3325, "epoch": 0.94132635006016, "wallclock": "2026-05-23T15:00:49.538622", "loss": 0.0939, "grad_norm": 1.14065420627594, "learning_rate": 5.7150808497307345e-06, "step_time_sec": 114.1 }, { "step": 3330, "epoch": 0.9427418784061151, "wallclock": "2026-05-23T15:02:45.245821", "loss": 0.0974, "grad_norm": 1.3234528303146362, "learning_rate": 5.7037344893117956e-06, "step_time_sec": 115.71 }, { "step": 3335, "epoch": 0.9441574067520702, "wallclock": "2026-05-23T15:04:37.650997", "loss": 0.0788, "grad_norm": 1.4045474529266357, "learning_rate": 5.692384430494466e-06, "step_time_sec": 112.41 }, { "step": 3340, "epoch": 0.9455729350980253, "wallclock": "2026-05-23T15:06:32.368133", "loss": 0.0849, "grad_norm": 1.256629228591919, "learning_rate": 5.6810307329277226e-06, "step_time_sec": 114.72 }, { "step": 3345, "epoch": 0.9469884634439805, "wallclock": "2026-05-23T15:08:26.827602", "loss": 0.0824, "grad_norm": 1.130339503288269, "learning_rate": 5.669673456279659e-06, "step_time_sec": 114.46 }, { "step": 3350, "epoch": 0.9484039917899356, "wallclock": "2026-05-23T15:10:21.207468", "loss": 0.0693, "grad_norm": 1.282491683959961, "learning_rate": 5.65831266023718e-06, "step_time_sec": 114.38, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 74.24 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 3355, "epoch": 0.9498195201358908, "wallclock": "2026-05-23T15:12:15.228993", "loss": 0.0891, "grad_norm": 1.3946020603179932, "learning_rate": 5.646948404505686e-06, "step_time_sec": 114.02 }, { "step": 3360, "epoch": 0.9512350484818458, "wallclock": "2026-05-23T15:14:09.438263", "loss": 0.0761, "grad_norm": 1.1103034019470215, "learning_rate": 5.635580748808763e-06, "step_time_sec": 114.21 }, { "step": 3365, "epoch": 0.952650576827801, "wallclock": "2026-05-23T15:16:05.154445", "loss": 0.0839, "grad_norm": 1.1873400211334229, "learning_rate": 5.624209752887858e-06, "step_time_sec": 115.72 }, { "step": 3370, "epoch": 0.9540661051737561, "wallclock": "2026-05-23T15:17:59.804004", "loss": 0.0777, "grad_norm": 0.7737529277801514, "learning_rate": 5.612835476501979e-06, "step_time_sec": 114.65 }, { "step": 3375, "epoch": 0.9554816335197113, "wallclock": "2026-05-23T15:19:54.892005", "loss": 0.0812, "grad_norm": 0.9554314613342285, "learning_rate": 5.601457979427369e-06, "step_time_sec": 115.09 }, { "step": 3380, "epoch": 0.9568971618656663, "wallclock": "2026-05-23T15:21:50.036983", "loss": 0.0792, "grad_norm": 1.1392946243286133, "learning_rate": 5.5900773214572016e-06, "step_time_sec": 115.14 }, { "step": 3385, "epoch": 0.9583126902116215, "wallclock": "2026-05-23T15:23:43.201325", "loss": 0.0727, "grad_norm": 1.3224341869354248, "learning_rate": 5.578693562401257e-06, "step_time_sec": 113.16 }, { "step": 3390, "epoch": 0.9597282185575766, "wallclock": "2026-05-23T15:25:36.809850", "loss": 0.0755, "grad_norm": 1.0473873615264893, "learning_rate": 5.567306762085619e-06, "step_time_sec": 113.61 }, { "step": 3395, "epoch": 0.9611437469035318, "wallclock": "2026-05-23T15:27:31.712929", "loss": 0.086, "grad_norm": 0.9381260871887207, "learning_rate": 5.555916980352349e-06, "step_time_sec": 114.9 }, { "step": 3400, "epoch": 0.9625592752494869, "wallclock": "2026-05-23T15:29:26.406120", "loss": 0.0665, "grad_norm": 1.002871036529541, "learning_rate": 5.544524277059179e-06, "step_time_sec": 114.69, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 74.24 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 3400, "epoch": 0.9625592752494869, "wallclock": "2026-05-23T15:30:19.956625", "eval_loss": 0.09332611411809921, "eval_runtime": 53.4588, "eval_samples_per_second": 4.676, "eval_steps_per_second": 1.178, "gpu": [ { "gpu": 0, "mem_allocated_gb": 39.05, "mem_reserved_gb": 74.24 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 3405, "epoch": 0.963974803595442, "wallclock": "2026-05-23T15:33:54.931063", "loss": 0.0858, "grad_norm": 1.1658086776733398, "learning_rate": 5.5331287120791954e-06, "step_time_sec": 268.52 }, { "step": 3410, "epoch": 0.9653903319413971, "wallclock": "2026-05-23T15:35:48.621597", "loss": 0.0581, "grad_norm": 0.7146378755569458, "learning_rate": 5.5217303453005225e-06, "step_time_sec": 113.69 }, { "step": 3415, "epoch": 0.9668058602873523, "wallclock": "2026-05-23T15:37:42.046818", "loss": 0.0755, "grad_norm": 1.1899656057357788, "learning_rate": 5.51032923662601e-06, "step_time_sec": 113.43 }, { "step": 3420, "epoch": 0.9682213886333074, "wallclock": "2026-05-23T15:39:35.748693", "loss": 0.0946, "grad_norm": 1.0844637155532837, "learning_rate": 5.498925445972918e-06, "step_time_sec": 113.7 }, { "step": 3425, "epoch": 0.9696369169792625, "wallclock": "2026-05-23T15:41:30.464137", "loss": 0.0793, "grad_norm": 1.2279070615768433, "learning_rate": 5.4875190332726e-06, "step_time_sec": 114.72 }, { "step": 3430, "epoch": 0.9710524453252176, "wallclock": "2026-05-23T15:43:26.510045", "loss": 0.0798, "grad_norm": 0.8382053971290588, "learning_rate": 5.476110058470192e-06, "step_time_sec": 116.05 }, { "step": 3435, "epoch": 0.9724679736711728, "wallclock": "2026-05-23T15:45:21.432258", "loss": 0.0813, "grad_norm": 0.9018872976303101, "learning_rate": 5.464698581524292e-06, "step_time_sec": 114.92 }, { "step": 3440, "epoch": 0.9738835020171279, "wallclock": "2026-05-23T15:47:17.457363", "loss": 0.0759, "grad_norm": 1.4535553455352783, "learning_rate": 5.453284662406646e-06, "step_time_sec": 116.03 }, { "step": 3445, "epoch": 0.9752990303630831, "wallclock": "2026-05-23T15:49:11.750836", "loss": 0.0817, "grad_norm": 1.2279826402664185, "learning_rate": 5.4418683611018416e-06, "step_time_sec": 114.29 }, { "step": 3450, "epoch": 0.9767145587090381, "wallclock": "2026-05-23T15:51:05.914914", "loss": 0.074, "grad_norm": 1.2694281339645386, "learning_rate": 5.430449737606978e-06, "step_time_sec": 114.16, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 74.24 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 3455, "epoch": 0.9781300870549933, "wallclock": "2026-05-23T15:53:02.864640", "loss": 0.088, "grad_norm": 1.1676980257034302, "learning_rate": 5.4190288519313626e-06, "step_time_sec": 116.95 }, { "step": 3460, "epoch": 0.9795456154009484, "wallclock": "2026-05-23T15:54:57.888514", "loss": 0.0917, "grad_norm": 1.2617217302322388, "learning_rate": 5.407605764096193e-06, "step_time_sec": 115.02 }, { "step": 3465, "epoch": 0.9809611437469036, "wallclock": "2026-05-23T15:56:52.536839", "loss": 0.0839, "grad_norm": 2.16770339012146, "learning_rate": 5.396180534134234e-06, "step_time_sec": 114.65 }, { "step": 3470, "epoch": 0.9823766720928586, "wallclock": "2026-05-23T15:58:47.764833", "loss": 0.077, "grad_norm": 1.0116336345672607, "learning_rate": 5.384753222089515e-06, "step_time_sec": 115.23 }, { "step": 3475, "epoch": 0.9837922004388138, "wallclock": "2026-05-23T16:00:43.362477", "loss": 0.0812, "grad_norm": 1.185133457183838, "learning_rate": 5.373323888017003e-06, "step_time_sec": 115.6 }, { "step": 3480, "epoch": 0.9852077287847689, "wallclock": "2026-05-23T16:02:37.887940", "loss": 0.0719, "grad_norm": 1.3264069557189941, "learning_rate": 5.361892591982291e-06, "step_time_sec": 114.53 }, { "step": 3485, "epoch": 0.9866232571307241, "wallclock": "2026-05-23T16:04:31.882696", "loss": 0.064, "grad_norm": 0.7329959273338318, "learning_rate": 5.350459394061287e-06, "step_time_sec": 113.99 }, { "step": 3490, "epoch": 0.9880387854766791, "wallclock": "2026-05-23T16:06:25.692519", "loss": 0.0819, "grad_norm": 0.8542604446411133, "learning_rate": 5.339024354339892e-06, "step_time_sec": 113.81 }, { "step": 3495, "epoch": 0.9894543138226343, "wallclock": "2026-05-23T16:08:20.147221", "loss": 0.0867, "grad_norm": 1.266552448272705, "learning_rate": 5.327587532913685e-06, "step_time_sec": 114.45 }, { "step": 3500, "epoch": 0.9908698421685894, "wallclock": "2026-05-23T16:10:14.094051", "loss": 0.0898, "grad_norm": 1.8799265623092651, "learning_rate": 5.31614898988761e-06, "step_time_sec": 113.95, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 74.24 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 3500, "epoch": 0.9908698421685894, "wallclock": "2026-05-23T16:11:07.029960", "eval_loss": 0.08754169940948486, "eval_runtime": 52.8398, "eval_samples_per_second": 4.731, "eval_steps_per_second": 1.192, "gpu": [ { "gpu": 0, "mem_allocated_gb": 39.05, "mem_reserved_gb": 74.24 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 3505, "epoch": 0.9922853705145446, "wallclock": "2026-05-23T16:14:43.226754", "loss": 0.0769, "grad_norm": 1.471697211265564, "learning_rate": 5.3047087853756585e-06, "step_time_sec": 269.13 }, { "step": 3510, "epoch": 0.9937008988604997, "wallclock": "2026-05-23T16:16:37.120854", "loss": 0.0858, "grad_norm": 1.3194319009780884, "learning_rate": 5.2932669795005545e-06, "step_time_sec": 113.89 }, { "step": 3515, "epoch": 0.9951164272064548, "wallclock": "2026-05-23T16:18:30.428191", "loss": 0.0649, "grad_norm": 1.7350393533706665, "learning_rate": 5.281823632393436e-06, "step_time_sec": 113.31 }, { "step": 3520, "epoch": 0.9965319555524099, "wallclock": "2026-05-23T16:20:22.921865", "loss": 0.08, "grad_norm": 1.3547072410583496, "learning_rate": 5.270378804193543e-06, "step_time_sec": 112.49 }, { "step": 3525, "epoch": 0.9979474838983651, "wallclock": "2026-05-23T16:22:17.038164", "loss": 0.0836, "grad_norm": 1.2849969863891602, "learning_rate": 5.258932555047897e-06, "step_time_sec": 114.12 }, { "step": 3530, "epoch": 0.9993630122443202, "wallclock": "2026-05-23T16:24:11.663981", "loss": 0.0811, "grad_norm": 0.9789690971374512, "learning_rate": 5.247484945110988e-06, "step_time_sec": 114.63 }, { "step": 3535, "epoch": 1.0007785405902754, "wallclock": "2026-05-23T16:26:14.307733", "loss": 0.0578, "grad_norm": 0.6540358066558838, "learning_rate": 5.23603603454446e-06, "step_time_sec": 122.64 }, { "step": 3540, "epoch": 1.0021940689362305, "wallclock": "2026-05-23T16:28:05.823018", "loss": 0.0478, "grad_norm": 0.8033650517463684, "learning_rate": 5.2245858835167854e-06, "step_time_sec": 111.52 }, { "step": 3545, "epoch": 1.0036095972821857, "wallclock": "2026-05-23T16:29:56.996787", "loss": 0.0555, "grad_norm": 1.4636964797973633, "learning_rate": 5.213134552202963e-06, "step_time_sec": 111.17 }, { "step": 3550, "epoch": 1.0050251256281406, "wallclock": "2026-05-23T16:31:49.397682", "loss": 0.0424, "grad_norm": 0.8096024990081787, "learning_rate": 5.201682100784194e-06, "step_time_sec": 112.4, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 74.24 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 3555, "epoch": 1.0064406539740958, "wallclock": "2026-05-23T16:33:40.456471", "loss": 0.0427, "grad_norm": 1.144333839416504, "learning_rate": 5.190228589447559e-06, "step_time_sec": 111.06 }, { "step": 3560, "epoch": 1.007856182320051, "wallclock": "2026-05-23T16:35:31.855205", "loss": 0.0432, "grad_norm": 1.8258119821548462, "learning_rate": 5.1787740783857164e-06, "step_time_sec": 111.4 }, { "step": 3565, "epoch": 1.009271710666006, "wallclock": "2026-05-23T16:37:24.317151", "loss": 0.0428, "grad_norm": 1.1291868686676025, "learning_rate": 5.167318627796577e-06, "step_time_sec": 112.46 }, { "step": 3570, "epoch": 1.0106872390119612, "wallclock": "2026-05-23T16:39:16.531936", "loss": 0.0518, "grad_norm": 2.1567795276641846, "learning_rate": 5.155862297882985e-06, "step_time_sec": 112.21 }, { "step": 3575, "epoch": 1.0121027673579164, "wallclock": "2026-05-23T16:41:08.942223", "loss": 0.045, "grad_norm": 1.2312395572662354, "learning_rate": 5.1444051488524115e-06, "step_time_sec": 112.41 }, { "step": 3580, "epoch": 1.0135182957038715, "wallclock": "2026-05-23T16:43:01.368164", "loss": 0.0472, "grad_norm": 1.38804030418396, "learning_rate": 5.13294724091663e-06, "step_time_sec": 112.43 }, { "step": 3585, "epoch": 1.0149338240498267, "wallclock": "2026-05-23T16:44:53.319138", "loss": 0.052, "grad_norm": 1.0492668151855469, "learning_rate": 5.1214886342914e-06, "step_time_sec": 111.95 }, { "step": 3590, "epoch": 1.0163493523957818, "wallclock": "2026-05-23T16:46:44.915175", "loss": 0.0447, "grad_norm": 0.9781032204627991, "learning_rate": 5.110029389196155e-06, "step_time_sec": 111.6 }, { "step": 3595, "epoch": 1.0177648807417368, "wallclock": "2026-05-23T16:48:37.406030", "loss": 0.0549, "grad_norm": 1.2402184009552002, "learning_rate": 5.0985695658536875e-06, "step_time_sec": 112.49 }, { "step": 3600, "epoch": 1.019180409087692, "wallclock": "2026-05-23T16:50:28.649642", "loss": 0.0433, "grad_norm": 1.0172066688537598, "learning_rate": 5.08710922448982e-06, "step_time_sec": 111.24, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 74.24 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 3600, "epoch": 1.019180409087692, "wallclock": "2026-05-23T16:51:23.450008", "eval_loss": 0.08236898481845856, "eval_runtime": 54.7048, "eval_samples_per_second": 4.57, "eval_steps_per_second": 1.152, "gpu": [ { "gpu": 0, "mem_allocated_gb": 39.05, "mem_reserved_gb": 74.24 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 3605, "epoch": 1.020595937433647, "wallclock": "2026-05-23T16:55:03.292833", "loss": 0.0511, "grad_norm": 1.146644949913025, "learning_rate": 5.0756484253331075e-06, "step_time_sec": 274.64 }, { "step": 3610, "epoch": 1.0220114657796022, "wallclock": "2026-05-23T16:56:56.732361", "loss": 0.0483, "grad_norm": 1.9536317586898804, "learning_rate": 5.0641872286145025e-06, "step_time_sec": 113.44 }, { "step": 3615, "epoch": 1.0234269941255574, "wallclock": "2026-05-23T16:58:49.705561", "loss": 0.0479, "grad_norm": 0.8863971829414368, "learning_rate": 5.052725694567052e-06, "step_time_sec": 112.97 }, { "step": 3620, "epoch": 1.0248425224715125, "wallclock": "2026-05-23T17:00:43.544919", "loss": 0.0442, "grad_norm": 0.6922377943992615, "learning_rate": 5.0412638834255755e-06, "step_time_sec": 113.84 }, { "step": 3625, "epoch": 1.0262580508174677, "wallclock": "2026-05-23T17:02:36.269444", "loss": 0.0559, "grad_norm": 1.2457826137542725, "learning_rate": 5.029801855426345e-06, "step_time_sec": 112.72 }, { "step": 3630, "epoch": 1.0276735791634228, "wallclock": "2026-05-23T17:04:30.429515", "loss": 0.0469, "grad_norm": 1.0091979503631592, "learning_rate": 5.018339670806775e-06, "step_time_sec": 114.16 }, { "step": 3635, "epoch": 1.0290891075093778, "wallclock": "2026-05-23T17:06:23.820278", "loss": 0.0491, "grad_norm": 1.115814208984375, "learning_rate": 5.006877389805106e-06, "step_time_sec": 113.39 }, { "step": 3640, "epoch": 1.030504635855333, "wallclock": "2026-05-23T17:08:16.705614", "loss": 0.0435, "grad_norm": 1.3016657829284668, "learning_rate": 4.995415072660077e-06, "step_time_sec": 112.89 }, { "step": 3645, "epoch": 1.031920164201288, "wallclock": "2026-05-23T17:10:11.282102", "loss": 0.0492, "grad_norm": 1.312011957168579, "learning_rate": 4.983952779610626e-06, "step_time_sec": 114.58 }, { "step": 3650, "epoch": 1.0333356925472432, "wallclock": "2026-05-23T17:12:06.127157", "loss": 0.0436, "grad_norm": 0.9364621639251709, "learning_rate": 4.9724905708955575e-06, "step_time_sec": 114.85, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 74.24 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 3655, "epoch": 1.0347512208931984, "wallclock": "2026-05-23T17:14:00.741230", "loss": 0.0565, "grad_norm": 1.8892085552215576, "learning_rate": 4.9610285067532345e-06, "step_time_sec": 114.61 }, { "step": 3660, "epoch": 1.0361667492391535, "wallclock": "2026-05-23T17:15:55.594162", "loss": 0.0476, "grad_norm": 0.8621354103088379, "learning_rate": 4.949566647421264e-06, "step_time_sec": 114.85 }, { "step": 3665, "epoch": 1.0375822775851087, "wallclock": "2026-05-23T17:17:49.683934", "loss": 0.0406, "grad_norm": 0.8497494459152222, "learning_rate": 4.938105053136173e-06, "step_time_sec": 114.09 }, { "step": 3670, "epoch": 1.0389978059310638, "wallclock": "2026-05-23T17:19:44.142869", "loss": 0.0365, "grad_norm": 1.4974132776260376, "learning_rate": 4.926643784133095e-06, "step_time_sec": 114.46 }, { "step": 3675, "epoch": 1.040413334277019, "wallclock": "2026-05-23T17:21:38.802035", "loss": 0.0487, "grad_norm": 0.9692957997322083, "learning_rate": 4.915182900645454e-06, "step_time_sec": 114.66 }, { "step": 3680, "epoch": 1.0418288626229741, "wallclock": "2026-05-23T17:23:32.339493", "loss": 0.0506, "grad_norm": 1.0823785066604614, "learning_rate": 4.903722462904653e-06, "step_time_sec": 113.54 }, { "step": 3685, "epoch": 1.043244390968929, "wallclock": "2026-05-23T17:25:27.273367", "loss": 0.0385, "grad_norm": 0.6259887218475342, "learning_rate": 4.892262531139747e-06, "step_time_sec": 114.93 }, { "step": 3690, "epoch": 1.0446599193148842, "wallclock": "2026-05-23T17:27:22.317617", "loss": 0.0461, "grad_norm": 0.6526616811752319, "learning_rate": 4.880803165577132e-06, "step_time_sec": 115.04 }, { "step": 3695, "epoch": 1.0460754476608394, "wallclock": "2026-05-23T17:29:17.202916", "loss": 0.0497, "grad_norm": 1.2579582929611206, "learning_rate": 4.869344426440234e-06, "step_time_sec": 114.89 }, { "step": 3700, "epoch": 1.0474909760067945, "wallclock": "2026-05-23T17:31:10.559777", "loss": 0.0453, "grad_norm": 1.414987325668335, "learning_rate": 4.857886373949179e-06, "step_time_sec": 113.36, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 74.24 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 3700, "epoch": 1.0474909760067945, "wallclock": "2026-05-23T17:32:02.738420", "eval_loss": 0.08643540740013123, "eval_runtime": 52.0828, "eval_samples_per_second": 4.8, "eval_steps_per_second": 1.21, "gpu": [ { "gpu": 0, "mem_allocated_gb": 39.05, "mem_reserved_gb": 74.24 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 3705, "epoch": 1.0489065043527497, "wallclock": "2026-05-23T17:35:38.532817", "loss": 0.0536, "grad_norm": 1.0652250051498413, "learning_rate": 4.846429068320488e-06, "step_time_sec": 267.97 }, { "step": 3710, "epoch": 1.0503220326987048, "wallclock": "2026-05-23T17:37:30.356092", "loss": 0.0482, "grad_norm": 1.0449877977371216, "learning_rate": 4.834972569766762e-06, "step_time_sec": 111.82 }, { "step": 3715, "epoch": 1.05173756104466, "wallclock": "2026-05-23T17:39:23.408283", "loss": 0.0397, "grad_norm": 0.9513642191886902, "learning_rate": 4.823516938496352e-06, "step_time_sec": 113.05 }, { "step": 3720, "epoch": 1.0531530893906151, "wallclock": "2026-05-23T17:41:16.527974", "loss": 0.0328, "grad_norm": 1.774491548538208, "learning_rate": 4.812062234713054e-06, "step_time_sec": 113.12 }, { "step": 3725, "epoch": 1.05456861773657, "wallclock": "2026-05-23T17:43:08.761213", "loss": 0.0485, "grad_norm": 1.2329373359680176, "learning_rate": 4.800608518615793e-06, "step_time_sec": 112.23 }, { "step": 3730, "epoch": 1.0559841460825252, "wallclock": "2026-05-23T17:45:02.615351", "loss": 0.0527, "grad_norm": 1.060661792755127, "learning_rate": 4.789155850398301e-06, "step_time_sec": 113.85 }, { "step": 3735, "epoch": 1.0573996744284804, "wallclock": "2026-05-23T17:46:56.000441", "loss": 0.0642, "grad_norm": 1.075607180595398, "learning_rate": 4.777704290248799e-06, "step_time_sec": 113.39 }, { "step": 3740, "epoch": 1.0588152027744355, "wallclock": "2026-05-23T17:48:49.574582", "loss": 0.0388, "grad_norm": 0.9697294235229492, "learning_rate": 4.766253898349694e-06, "step_time_sec": 113.57 }, { "step": 3745, "epoch": 1.0602307311203907, "wallclock": "2026-05-23T17:50:41.983236", "loss": 0.0409, "grad_norm": 1.6531593799591064, "learning_rate": 4.754804734877245e-06, "step_time_sec": 112.41 }, { "step": 3750, "epoch": 1.0616462594663458, "wallclock": "2026-05-23T17:52:35.437590", "loss": 0.0355, "grad_norm": 1.1890569925308228, "learning_rate": 4.743356860001256e-06, "step_time_sec": 113.45, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 74.24 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 3755, "epoch": 1.063061787812301, "wallclock": "2026-05-23T17:54:28.614407", "loss": 0.0418, "grad_norm": 1.71039879322052, "learning_rate": 4.731910333884766e-06, "step_time_sec": 113.18 }, { "step": 3760, "epoch": 1.0644773161582561, "wallclock": "2026-05-23T17:56:21.627594", "loss": 0.0414, "grad_norm": 2.179187774658203, "learning_rate": 4.720465216683718e-06, "step_time_sec": 113.01 }, { "step": 3765, "epoch": 1.0658928445042113, "wallclock": "2026-05-23T17:58:15.543251", "loss": 0.0499, "grad_norm": 1.6056452989578247, "learning_rate": 4.70902156854665e-06, "step_time_sec": 113.92 }, { "step": 3770, "epoch": 1.0673083728501664, "wallclock": "2026-05-23T18:00:08.857393", "loss": 0.0444, "grad_norm": 1.382399320602417, "learning_rate": 4.697579449614389e-06, "step_time_sec": 113.31 }, { "step": 3775, "epoch": 1.0687239011961214, "wallclock": "2026-05-23T18:02:01.237195", "loss": 0.0639, "grad_norm": 1.6109445095062256, "learning_rate": 4.686138920019717e-06, "step_time_sec": 112.38 }, { "step": 3780, "epoch": 1.0701394295420765, "wallclock": "2026-05-23T18:03:54.810260", "loss": 0.0491, "grad_norm": 1.0975931882858276, "learning_rate": 4.674700039887062e-06, "step_time_sec": 113.57 }, { "step": 3785, "epoch": 1.0715549578880317, "wallclock": "2026-05-23T18:05:49.257162", "loss": 0.0489, "grad_norm": 0.7262698411941528, "learning_rate": 4.6632628693321925e-06, "step_time_sec": 114.45 }, { "step": 3790, "epoch": 1.0729704862339868, "wallclock": "2026-05-23T18:07:42.422565", "loss": 0.0488, "grad_norm": 1.3172861337661743, "learning_rate": 4.651827468461885e-06, "step_time_sec": 113.17 }, { "step": 3795, "epoch": 1.074386014579942, "wallclock": "2026-05-23T18:09:36.139435", "loss": 0.0466, "grad_norm": 1.1381676197052002, "learning_rate": 4.640393897373614e-06, "step_time_sec": 113.72 }, { "step": 3800, "epoch": 1.0758015429258971, "wallclock": "2026-05-23T18:11:30.211725", "loss": 0.0403, "grad_norm": 1.1041913032531738, "learning_rate": 4.628962216155249e-06, "step_time_sec": 114.07, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 74.24 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 3800, "epoch": 1.0758015429258971, "wallclock": "2026-05-23T18:12:22.307956", "eval_loss": 0.08441882580518723, "eval_runtime": 51.9948, "eval_samples_per_second": 4.808, "eval_steps_per_second": 1.212, "gpu": [ { "gpu": 0, "mem_allocated_gb": 39.05, "mem_reserved_gb": 74.24 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 3805, "epoch": 1.0772170712718523, "wallclock": "2026-05-23T18:15:57.451185", "loss": 0.0379, "grad_norm": 1.2826017141342163, "learning_rate": 4.617532484884715e-06, "step_time_sec": 267.24 }, { "step": 3810, "epoch": 1.0786325996178074, "wallclock": "2026-05-23T18:17:50.121618", "loss": 0.0338, "grad_norm": 1.091307282447815, "learning_rate": 4.606104763629693e-06, "step_time_sec": 112.67 }, { "step": 3815, "epoch": 1.0800481279637624, "wallclock": "2026-05-23T18:19:44.910197", "loss": 0.058, "grad_norm": 1.0848028659820557, "learning_rate": 4.594679112447307e-06, "step_time_sec": 114.79 }, { "step": 3820, "epoch": 1.0814636563097175, "wallclock": "2026-05-23T18:21:40.024155", "loss": 0.049, "grad_norm": 1.1905133724212646, "learning_rate": 4.5832555913837925e-06, "step_time_sec": 115.11 }, { "step": 3825, "epoch": 1.0828791846556727, "wallclock": "2026-05-23T18:23:34.370594", "loss": 0.067, "grad_norm": 1.21793532371521, "learning_rate": 4.571834260474195e-06, "step_time_sec": 114.35 }, { "step": 3830, "epoch": 1.0842947130016278, "wallclock": "2026-05-23T18:25:28.664784", "loss": 0.0395, "grad_norm": 1.1224967241287231, "learning_rate": 4.560415179742052e-06, "step_time_sec": 114.29 }, { "step": 3835, "epoch": 1.085710241347583, "wallclock": "2026-05-23T18:27:22.873783", "loss": 0.0559, "grad_norm": 0.9353971481323242, "learning_rate": 4.5489984091990735e-06, "step_time_sec": 114.21 }, { "step": 3840, "epoch": 1.0871257696935381, "wallclock": "2026-05-23T18:29:17.206871", "loss": 0.0554, "grad_norm": 0.6831589937210083, "learning_rate": 4.537584008844823e-06, "step_time_sec": 114.33 }, { "step": 3845, "epoch": 1.0885412980394933, "wallclock": "2026-05-23T18:31:12.398612", "loss": 0.0525, "grad_norm": 1.0940909385681152, "learning_rate": 4.526172038666419e-06, "step_time_sec": 115.19 }, { "step": 3850, "epoch": 1.0899568263854484, "wallclock": "2026-05-23T18:33:06.117469", "loss": 0.0573, "grad_norm": 0.8475215435028076, "learning_rate": 4.514762558638199e-06, "step_time_sec": 113.72, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 74.24 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 3855, "epoch": 1.0913723547314036, "wallclock": "2026-05-23T18:34:59.556884", "loss": 0.0483, "grad_norm": 1.0420924425125122, "learning_rate": 4.503355628721417e-06, "step_time_sec": 113.44 }, { "step": 3860, "epoch": 1.0927878830773585, "wallclock": "2026-05-23T18:36:53.139554", "loss": 0.0524, "grad_norm": 1.5134800672531128, "learning_rate": 4.491951308863926e-06, "step_time_sec": 113.58 }, { "step": 3865, "epoch": 1.0942034114233137, "wallclock": "2026-05-23T18:38:46.961705", "loss": 0.0327, "grad_norm": 1.369831919670105, "learning_rate": 4.480549658999862e-06, "step_time_sec": 113.82 }, { "step": 3870, "epoch": 1.0956189397692688, "wallclock": "2026-05-23T18:40:39.105763", "loss": 0.0467, "grad_norm": 1.45563542842865, "learning_rate": 4.469150739049327e-06, "step_time_sec": 112.14 }, { "step": 3875, "epoch": 1.097034468115224, "wallclock": "2026-05-23T18:42:31.937185", "loss": 0.0471, "grad_norm": 0.9477264881134033, "learning_rate": 4.45775460891808e-06, "step_time_sec": 112.83 }, { "step": 3880, "epoch": 1.0984499964611791, "wallclock": "2026-05-23T18:44:25.360640", "loss": 0.0492, "grad_norm": 0.7854604721069336, "learning_rate": 4.446361328497215e-06, "step_time_sec": 113.42 }, { "step": 3885, "epoch": 1.0998655248071343, "wallclock": "2026-05-23T18:46:17.165445", "loss": 0.0427, "grad_norm": 0.7942948341369629, "learning_rate": 4.434970957662849e-06, "step_time_sec": 111.8 }, { "step": 3890, "epoch": 1.1012810531530894, "wallclock": "2026-05-23T18:48:10.450485", "loss": 0.043, "grad_norm": 1.1920311450958252, "learning_rate": 4.423583556275814e-06, "step_time_sec": 113.29 }, { "step": 3895, "epoch": 1.1026965814990446, "wallclock": "2026-05-23T18:50:04.687540", "loss": 0.0502, "grad_norm": 1.4759620428085327, "learning_rate": 4.41219918418133e-06, "step_time_sec": 114.24 }, { "step": 3900, "epoch": 1.1041121098449997, "wallclock": "2026-05-23T18:51:57.525567", "loss": 0.0514, "grad_norm": 1.4128731489181519, "learning_rate": 4.400817901208697e-06, "step_time_sec": 112.84, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 74.24 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 3900, "epoch": 1.1041121098449997, "wallclock": "2026-05-23T18:52:50.285022", "eval_loss": 0.08410802483558655, "eval_runtime": 52.6672, "eval_samples_per_second": 4.747, "eval_steps_per_second": 1.196, "gpu": [ { "gpu": 0, "mem_allocated_gb": 39.05, "mem_reserved_gb": 74.24 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 3900, "epoch": 1.1041121098449997, "wallclock": "2026-05-23T18:54:34.984762", "train_runtime": 61042.0543, "train_samples_per_second": 3.703, "train_steps_per_second": 0.116, "total_flos": 1.0157637889163264e+16, "train_loss": 0.0625501875159068, "gpu": [ { "gpu": 0, "mem_allocated_gb": 33.45, "mem_reserved_gb": 74.24 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] }, { "step": 3900, "epoch": 1.1041121098449997, "wallclock": "2026-05-23T18:55:51.257083", "eval_loss": 0.08236898481845856, "eval_runtime": 62.4976, "eval_samples_per_second": 4.0, "eval_steps_per_second": 1.008, "gpu": [ { "gpu": 0, "mem_allocated_gb": 39.05, "mem_reserved_gb": 74.24 }, { "gpu": 1, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 2, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 }, { "gpu": 3, "mem_allocated_gb": 0.0, "mem_reserved_gb": 0.0 } ] } ]